diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index afd0d9e7539ef..2cddc3365d5d7 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -768,9 +768,6 @@ For example: performant than code generated for XNACK replay disabled. - cu-stores TODO On GFX12.5, controls whether ``scope:SCOPE_CU`` stores may be used. - If disabled, all stores will be done at ``scope:SCOPE_SE`` or greater. - =============== ============================ ================================================== .. _amdgpu-target-id: @@ -5114,9 +5111,7 @@ The fields used by CP for code objects before V3 also match those specified in and must be 0, >454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT _SIZE - 455 1 bit USES_CU_STORES GFX12.5: Whether the ``cu-stores`` target attribute is enabled. - If 0, then all stores are ``SCOPE_SE`` or higher. - 457:456 2 bits Reserved, must be 0. + 457:455 3 bits Reserved, must be 0. 458 1 bit ENABLE_WAVEFRONT_SIZE32 GFX6-GFX9 Reserved, must be 0. GFX10-GFX11 @@ -18254,8 +18249,6 @@ terminated by an ``.end_amdhsa_kernel`` directive. GFX942) ``.amdhsa_user_sgpr_private_segment_size`` 0 GFX6-GFX12 Controls ENABLE_SGPR_PRIVATE_SEGMENT_SIZE in :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. - ``.amdhsa_uses_cu_stores`` 0 GFX12.5 Controls USES_CU_STORES in - :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_wavefront_size32`` Target GFX10-GFX12 Controls ENABLE_WAVEFRONT_SIZE32 in Feature :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. 
Specific diff --git a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h index fb9d68428cf18..418d2b36114c5 100644 --- a/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ b/llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -252,8 +252,7 @@ enum : int32_t { KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_ID, 4, 1), KERNEL_CODE_PROPERTY(ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), - KERNEL_CODE_PROPERTY(RESERVED0, 7, 2), - KERNEL_CODE_PROPERTY(USES_CU_STORES, 9, 1), // GFX12.5 +cu-stores + KERNEL_CODE_PROPERTY(RESERVED0, 7, 3), KERNEL_CODE_PROPERTY(ENABLE_WAVEFRONT_SIZE32, 10, 1), // GFX10+ KERNEL_CODE_PROPERTY(USES_DYNAMIC_STACK, 11, 1), KERNEL_CODE_PROPERTY(RESERVED1, 12, 4), diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index a366db1c580ba..ffbda14dcd849 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -289,12 +289,6 @@ def FeatureSafeCUPrefetch : SubtargetFeature<"safe-cu-prefetch", "VMEM CU scope prefetches do not fail on illegal address" >; -def FeatureCUStores : SubtargetFeature<"cu-stores", - "HasCUStores", - "true", - "Whether SCOPE_CU stores can be used on GFX12.5" ->; - def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", "HasVcmpxExecWARHazard", "true", @@ -2042,7 +2036,6 @@ def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureRequiresAlignedVGPRs, - FeatureCUStores, FeatureAddressableLocalMemorySize327680, FeatureCuMode, Feature1024AddressableVGPRs, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 1fff188c75819..29f8f9bc8b54c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -557,7 +557,6 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( MCContext &Ctx = MF.getContext(); uint16_t 
KernelCodeProperties = 0; const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); - const GCNSubtarget &ST = MF.getSubtarget(); if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= @@ -587,13 +586,10 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; } - if (ST.isWave32()) { + if (MF.getSubtarget().isWave32()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; } - if (isGFX1250(ST) && ST.hasCUStores()) { - KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES; - } // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be // un-evaluatable at this point so it cannot be conditionally checked here. diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2ce1e9e410b23..e420f2ad676f9 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -6181,12 +6181,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { ExprVal, ValRange); if (Val) ImpliedUserSGPRCount += 1; - } else if (ID == ".amdhsa_uses_cu_stores") { - if (!isGFX1250()) - return Error(IDRange.Start, "directive requires gfx12.5", IDRange); - - PARSE_BITS_ENTRY(KD.kernel_code_properties, - KERNEL_CODE_PROPERTY_USES_CU_STORES, ExprVal, ValRange); } else if (ID == ".amdhsa_wavefront_size32") { EXPR_RESOLVE_OR_ERROR(EvaluatableExpr); if (IVersion.Major < 10) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index b50b2a2e6e23c..6f6039bf4ec21 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2639,9 +2639,6 @@ Expected AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); 
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); - if (isGFX1250()) - PRINT_DIRECTIVE(".amdhsa_uses_cu_stores", - KERNEL_CODE_PROPERTY_USES_CU_STORES); if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0) return createReservedKDBitsError(KERNEL_CODE_PROPERTY_RESERVED0, diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 4475c8d1d1602..e172a9c699fb1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -252,7 +252,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasVmemPrefInsts = false; bool HasSafeSmemPrefetch = false; bool HasSafeCUPrefetch = false; - bool HasCUStores = false; bool HasVcmpxExecWARHazard = false; bool HasLdsBranchVmemWARHazard = false; bool HasNSAtoVMEMBug = false; @@ -1017,8 +1016,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasSafeCUPrefetch() const { return HasSafeCUPrefetch; } - bool hasCUStores() const { return HasCUStores; } - // Has s_cmpk_* instructions. bool hasSCmpK() const { return getGeneration() < GFX12; } @@ -1835,6 +1832,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasScratchBaseForwardingHazard() const { return GFX1250Insts && getGeneration() == GFX12; } + + /// \returns true if the subtarget requires a wait for xcnt before atomic + /// flat/global stores & rmw. 
+ bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 0bbab29dbda18..ff6a21239345d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -448,11 +448,6 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, ".amdhsa_user_sgpr_private_segment_size"); - if (isGFX1250(STI)) - PrintField(KD.kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES_SHIFT, - amdhsa::KERNEL_CODE_PROPERTY_USES_CU_STORES, - ".amdhsa_uses_cu_stores"); if (IVersion.Major >= 10) PrintField(KD.kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32_SHIFT, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index c964d02ee2b97..f7dde2b90b68e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1055,6 +1055,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return AMDGPU::S_WAIT_DSCNT; case AMDGPU::S_WAIT_KMCNT_soft: return AMDGPU::S_WAIT_KMCNT; + case AMDGPU::S_WAIT_XCNT_soft: + return AMDGPU::S_WAIT_XCNT; default: return Opcode; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 6ab4eb4bde97c..f61c0d8f84b29 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -606,7 +606,11 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) 
{ + // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases + // the behavior is the same if assuming GFX12.0 in CU mode. + assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled()); + } bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, @@ -2198,7 +2202,8 @@ bool SIGfx10CacheControl::insertBarrierStart( // mode. This is because a CU mode release fence does not emit any wait, which // is fine when only dealing with vmem, but isn't sufficient in the presence // of barriers which do not go through vmem. - if (!ST.isCuModeEnabled()) + // GFX12.5 does not require this additional wait. + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) return false; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), @@ -2378,12 +2383,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, STORECnt |= true; break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to wait for operations to complete to ensure - // they are visible to waves in the other CU as the L0 is per CU. - // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU + // of the WGP. Therefore need to wait for operations to complete to + // ensure they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0. + // + // GFX12.5: + // TODO DOCS + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2435,7 +2444,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // // This also applies to fences. 
Fences cannot pair with an instruction // tracked with bvh/samplecnt as we don't have any atomics that do that. - if (Order != AtomicOrdering::Acquire) { + if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } @@ -2487,10 +2496,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, ScopeImm = AMDGPU::CPol::SCOPE_DEV; break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore we need to invalidate the L0 which is per CU. - // Otherwise in CU mode all waves of a work-group are on the same CU, and so - // the L0 does not need to be invalidated. + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore we need to invalidate the L0 which is per CU. + // Otherwise in CU mode all waves of a work-group are on the same CU, and + // so the L0 does not need to be invalidated. + // + // GFX12.5: + // TODO DOCS if (ST.isCuModeEnabled()) return false; @@ -2535,7 +2548,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - // global_wb is only necessary at system scope for gfx120x targets. + // global_wb is only necessary at system scope for GFX12.0, + // it is also necessary at device scope for GFX12.5. // // Emitting it for lower scopes is a slow no-op, so we omit it // for performance. @@ -2545,6 +2559,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, .addImm(AMDGPU::CPol::SCOPE_SYS); break; case SIAtomicScope::AGENT: + // TODO DOCS + if (ST.hasGFX1250Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) + .addImm(AMDGPU::CPol::SCOPE_DEV); + } + break; case SIAtomicScope::WORKGROUP: // No WB necessary, but we still have to wait. 
break; @@ -2607,27 +2627,40 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( } bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { - MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); - if (!CPol) - return false; + assert(MI.mayStore() && "Not a Store inst"); + const bool IsRMW = (MI.mayLoad() && MI.mayStore()); + bool Changed = false; + + // GFX12.5 only: xcnt wait is needed before flat and global atomic + // stores/RMWs. + if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { + MachineBasicBlock &MBB = *MI.getParent(); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); + Changed = true; + } + + // Remaining fixes do not apply to RMWs. + if (IsRMW) + return Changed; + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) // Some vmem operations do not have a scope and are not affected. + return Changed; const unsigned Scope = CPol->getImm() & CPol::SCOPE; // GFX12.0 only: Extra waits needed before system scope stores. if (!ST.hasGFX1250Insts()) { if (!Atomic && Scope == CPol::SCOPE_SYS) return insertWaitsBeforeSystemScopeStore(MI); - return false; + return Changed; } // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - // We also require SCOPE_SE minimum if we not have the "cu-stores" feature. 
- if (Scope == CPol::SCOPE_CU && - (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) + if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) return setScope(MI, CPol::SCOPE_SE); - return false; + return Changed; } bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const { @@ -2839,6 +2872,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; + MachineInstr &RMWMI = *MI; if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); @@ -2873,6 +2907,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, Position::AFTER); } + Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index dbe0b8c496fed..12a27db241c4e 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1653,6 +1653,11 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } + +let SubtargetPredicate = HasWaitXcnt in { + def S_WAIT_XCNT_soft : SOPP_Pseudo<"", (ins s16imm:$simm16), "$simm16">; +} + // Represents the point at which a wave must wait for all outstanding direct loads to LDS. // Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 481a2540eacb7..e886ea4fc6ac6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1571,6 +1572,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1645,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1715,6 +1720,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1792,6 +1800,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -1902,6 +1911,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1947,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1987,6 +2000,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2031,6 +2047,7 @@ define amdgpu_kernel void 
@flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2107,6 +2124,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2190,6 +2208,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2418,6 +2439,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index 5fc9f4a0f8038..4bb2a13d02cc7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -364,6 +364,7 @@ define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) { ; 
GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -406,6 +407,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -448,6 +450,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -490,6 +493,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1344,6 +1348,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1386,6 +1391,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1428,6 +1434,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1470,6 +1477,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 3dedf008c917e..1bf37d512f845 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -10,6 +10,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; 
GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -47,6 +49,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -85,6 +89,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -128,6 +134,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -166,6 +174,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_2048: ; GFX1250: ; %bb.0: +; 
GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -205,6 +215,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_neg2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -262,6 +274,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -276,6 +290,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -325,6 +341,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; 
GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -339,6 +357,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -389,6 +409,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -402,6 +424,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv 
scope:SCOPE_DEV @@ -449,6 +473,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -462,6 +488,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 offset:42 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -532,6 +560,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB10_5 ; GFX1250-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -578,6 +608,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB10_5 ; GFX1250-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -712,6 +744,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB11_5 ; GFX1250-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -761,6 +795,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB11_5 ; GFX1250-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -896,6 +932,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -933,6 +971,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; 
GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1044,6 +1084,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1084,6 +1126,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1186,6 +1230,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1224,6 +1270,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb 
scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1269,6 +1317,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1305,6 +1355,8 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1368,6 +1420,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB18_5 ; GFX1250-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1414,6 +1468,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB18_5 ; GFX1250-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global 
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1552,6 +1608,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB19_5 ; GFX1250-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1601,6 +1659,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB19_5 ; GFX1250-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1740,6 +1800,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1780,6 +1842,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi ; 
GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1902,6 +1966,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1945,6 +2011,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2058,6 +2126,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2096,6 +2166,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float 
@flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2141,6 +2213,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2177,6 +2251,8 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2240,6 +2316,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB26_5 ; GFX1250-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2286,6 +2364,8 @@ define amdgpu_ps <2 x float> 
@flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB26_5 ; GFX1250-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2426,6 +2506,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB27_5 ; GFX1250-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2475,6 +2557,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB27_5 ; GFX1250-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2616,6 +2700,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: 
global_inv scope:SCOPE_DEV @@ -2656,6 +2742,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2780,6 +2868,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2823,6 +2913,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2938,6 +3030,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2976,6 +3070,8 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3021,6 +3117,8 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3057,6 +3155,8 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3120,6 +3220,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB34_5 ; GFX1250-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3167,6 +3269,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB34_5 ; GFX1250-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3306,6 +3410,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB35_5 ; GFX1250-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3356,6 +3462,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB35_5 ; GFX1250-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3496,6 +3604,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: 
s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3537,6 +3647,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3660,6 +3772,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -3704,6 +3818,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -3818,6 +3934,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; 
GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3856,6 +3974,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3 define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3901,6 +4021,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3937,6 +4059,8 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4000,6 +4124,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB42_5 ; GFX1250-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: 
global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4047,6 +4173,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB42_5 ; GFX1250-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4186,6 +4314,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB43_5 ; GFX1250-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4236,6 +4366,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB43_5 ; GFX1250-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4376,6 +4508,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi ; 
GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4417,6 +4551,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4540,6 +4676,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4584,6 +4722,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -4698,6 +4838,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg 
%sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4736,6 +4878,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4781,6 +4925,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4817,6 +4963,8 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4880,6 +5028,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; 
GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB50_5 ; GFX1250-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -4927,6 +5077,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB50_5 ; GFX1250-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -5066,6 +5218,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB51_5 ; GFX1250-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -5116,6 +5270,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB51_5 ; GFX1250-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV 
@@ -5256,6 +5412,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -5297,6 +5455,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -5420,6 +5580,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -5464,6 +5626,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ 
-5650,7 +5814,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_max_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn: @@ -5681,7 +5845,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128: @@ -5735,20 +5899,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB58_5 ; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -5782,20 +5943,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr 
inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB58_5 ; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -5923,20 +6081,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB59_5 ; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; 
GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -5973,20 +6128,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB59_5 ; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -6119,9 +6271,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: 
s_cbranch_execz .LBB60_2 @@ -6158,9 +6310,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 @@ -6279,9 +6431,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 @@ -6321,9 +6473,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 @@ -6504,7 +6656,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_min_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn: @@ -6535,7 +6687,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128: @@ -6589,20 +6741,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4 ; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB66_5 ; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -6636,20 +6785,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4 ; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch 
.LBB66_5 ; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -6777,20 +6923,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB67_5 ; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, 
-1, v0, vcc_lo @@ -6827,20 +6970,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB67_5 ; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -6973,9 +7113,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 @@ -7012,9 +7152,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, 
v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 @@ -7133,9 +7273,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 @@ -7175,9 +7315,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 @@ -7358,7 +7498,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_umax_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn: @@ -7389,7 +7529,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-LABEL: 
flat_umax_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128: @@ -7443,20 +7583,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB74_5 ; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -7490,20 +7627,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB74_5 ; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; 
GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -7631,20 +7765,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB75_5 ; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -7681,20 +7812,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB75_5 ; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -7827,9 +7955,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 @@ -7866,9 +7994,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 @@ -7987,9 +8115,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 @@ -8029,9 +8157,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 @@ -8212,7 +8340,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_umin_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn: @@ -8243,7 +8371,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: 
flat_umin_saddr_i32_nortn_neg128: @@ -8297,20 +8425,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB82_5 ; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -8344,20 +8469,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB82_5 ; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; 
GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -8485,20 +8607,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB83_5 ; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -8535,20 +8654,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB83_5 ; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] 
offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -8681,9 +8797,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 @@ -8720,9 +8836,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 @@ -8841,9 +8957,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_endpgm 
; GFX1250-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 @@ -8883,9 +8999,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 @@ -11743,7 +11859,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB110_5 ; GFX1250-SDAG-NEXT: s_branch .LBB110_6 ; GFX1250-SDAG-NEXT: .LBB110_3: @@ -11757,7 +11875,6 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] @@ -11768,12 +11885,12 @@ 
define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; @@ -11799,7 +11916,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: .LBB110_3: ; %Flow ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -11811,7 +11930,6 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] @@ -11827,12 +11945,12 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; @@ -11969,7 +12087,9 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: .LBB111_5: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 @@ -11991,6 +12111,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: ds_add_f64 v2, v[0:1] ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] @@ -12015,7 +12136,9 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: .LBB111_3: ; %Flow ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12044,6 +12167,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: 
s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: ds_add_f64 v2, v[0:1] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi @@ -12162,20 +12286,19 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB112_2 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB112_3 ; GFX1250-SDAG-NEXT: s_branch .LBB112_4 ; GFX1250-SDAG-NEXT: .LBB112_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -12183,7 +12306,7 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE ; GFX1250-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; @@ -12203,10 +12326,10 @@ define double 
@flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB112_2: ; %Flow -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 @@ -12217,7 +12340,6 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -12225,7 +12347,7 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE ; GFX1250-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; @@ -12322,8 +12444,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 
0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2 ; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo @@ -12355,8 +12478,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] offset:80 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB113_2: ; %Flow ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 @@ -12463,20 +12587,19 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB114_2 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB114_3 ; GFX1250-SDAG-NEXT: s_branch .LBB114_4 ; GFX1250-SDAG-NEXT: .LBB114_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -12484,7 +12607,7 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double 
%data) { ; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE ; GFX1250-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; @@ -12504,10 +12627,10 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB114_2: ; %Flow -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 @@ -12518,7 +12641,6 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] @@ -12526,7 +12648,7 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE ; GFX1250-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; @@ 
-12623,8 +12745,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2 ; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo @@ -12656,8 +12779,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] offset:80 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB115_2: ; %Flow ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 @@ -12754,6 +12878,7 @@ define float @flat_atomic_fadd_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12785,8 +12910,9 @@ define void @flat_atomic_fadd_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; 
GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_nortn: @@ -12816,6 +12942,7 @@ define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -12878,8 +13005,9 @@ define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_nortn: @@ -12939,6 +13067,7 @@ define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -13001,8 +13130,9 @@ define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_nortn: @@ -13062,6 +13192,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 
; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_f16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -13093,8 +13224,9 @@ define void @flat_atomic_fadd_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_f16 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: @@ -13135,11 +13267,13 @@ define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 ; GFX1250-NEXT: v_pk_max_num_f16 v4, v0, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB124_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13215,10 +13349,11 @@ define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data ; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13298,11 +13433,13 @@ define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> % ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 ; GFX1250-NEXT: v_pk_min_num_f16 v4, v0, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB126_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13378,10 +13515,11 @@ define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data ; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -13450,6 +13588,7 @@ define <2 x bfloat> @flat_atomic_fadd_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_bf16 v0, v1, v0, s[0:1] 
offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -13473,8 +13612,9 @@ define void @flat_atomic_fadd_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_bf16 v1, v0, s[0:1] offset:40 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn: @@ -13504,11 +13644,13 @@ define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo ; GFX1250-NEXT: v_mov_b32_e32 v5, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_bf16 v4, v5, v0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB130_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13560,10 +13702,11 @@ define void @flat_atomic_fmax_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_pk_max_num_bf16 v2, v3, v0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) @@ -13618,11 +13761,13 @@ define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo ; GFX1250-NEXT: v_mov_b32_e32 v5, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_min_num_bf16 v4, v5, v0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB132_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end @@ -13674,10 +13819,11 @@ define void @flat_atomic_fmin_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_pk_min_num_bf16 v2, v3, v0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll index 1e293c28ce397..ba761bedb905c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll @@ -38,6 +38,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -79,6 +80,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -189,6 +191,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -255,6 +258,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 3856f0c327495..160b35352d8a4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1473,6 +1473,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1513,6 +1514,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 
0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1557,6 +1561,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1597,6 +1602,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1673,6 +1681,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -1765,6 +1774,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1809,6 +1821,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1849,6 +1862,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1893,6 +1909,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1969,6 +1986,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2063,6 +2081,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: 
v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2136,6 +2157,7 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2275,6 +2297,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2307,6 +2330,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2339,6 +2363,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll 
b/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll deleted file mode 100644 index fcdba69c30213..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/gfx1250-no-scope-cu-stores.ll +++ /dev/null @@ -1,88 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -mcpu=gfx1250 -mattr=-cu-stores < %s | FileCheck --check-prefixes=GCN,NOCU %s - -; Check that if -cu-stores is used, we use SCOPE_SE minimum on all stores. - -; GCN: flat_store: -; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel flat_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @flat_store(ptr %dst, i32 %val) { -entry: - store i32 %val, ptr %dst - ret void -} - -; GCN: global_store: -; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}} -; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel global_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @global_store(ptr addrspace(1) %dst, i32 %val) { -entry: - store i32 %val, ptr addrspace(1) %dst - ret void -} - -; GCN: local_store: -; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; GCN: .amdhsa_kernel local_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @local_store(ptr addrspace(3) %dst, i32 %val) { -entry: - store i32 %val, ptr addrspace(3) %dst - ret void -} - -; GCN: scratch_store: -; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel scratch_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @scratch_store(ptr addrspace(5) %dst, i32 %val) { -entry: - store i32 %val, ptr addrspace(5) %dst - ret void -} - -; 
GCN: flat_atomic_store: -; CU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; NOCU: flat_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel flat_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @flat_atomic_store(ptr %dst, i32 %val) { -entry: - store atomic i32 %val, ptr %dst syncscope("wavefront") unordered, align 4 - ret void -} - -; GCN: global_atomic_store: -; CU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}}{{$}} -; NOCU: global_store_b32 v{{.*}}, v{{.*}}, s{{.*}} scope:SCOPE_SE -; GCN: .amdhsa_kernel global_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @global_atomic_store(ptr addrspace(1) %dst, i32 %val) { -entry: - store atomic i32 %val, ptr addrspace(1) %dst syncscope("wavefront") unordered, align 4 - ret void -} - -; GCN: local_atomic_store: -; CU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; NOCU: ds_store_b32 v{{.*}}, v{{.*}}{{$}} -; GCN: .amdhsa_kernel local_atomic_store -; CU: .amdhsa_uses_cu_stores 1 -; NOCU: .amdhsa_uses_cu_stores 0 -define amdgpu_kernel void @local_atomic_store(ptr addrspace(3) %dst, i32 %val) { -entry: - store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll index e3ec4d1f0f67a..614a221d43d53 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll @@ -130,6 +130,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 
0x0 @@ -144,6 +145,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -158,6 +160,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -220,6 +223,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -234,6 +238,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -248,6 +253,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -385,6 +391,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr ; 
GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -399,6 +406,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -413,6 +421,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -475,6 +484,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -489,6 +499,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -503,6 +514,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll index e86f0e0083805..2b04ab5ab8a00 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll @@ -124,6 +124,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -137,6 +138,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -150,6 +152,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -163,6 +166,7 @@ define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef r ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] 
@@ -176,6 +180,7 @@ define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr nou ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -189,6 +194,7 @@ define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr nou ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -202,6 +208,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -215,6 +222,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -228,6 +236,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -358,6 +367,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr ; GFX1250: ; 
%bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -371,6 +381,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -384,6 +395,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -397,6 +409,7 @@ define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr no ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -410,6 +423,7 @@ define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -423,6 +437,7 @@ define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst( ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 
0x0 ; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -436,6 +451,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -449,6 +465,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -462,6 +479,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll index e921f581c00a7..516c3946f63dc 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll @@ -45,7 +45,6 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX1250-LABEL: test_s_barrier: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_alu 0xffe3 ; GFX1250-NEXT: s_barrier_signal -1 ; GFX1250-NEXT: s_barrier_wait -1 ; GFX1250-NEXT: s_endpgm @@ -103,8 +102,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX1250-LABEL: test_s_barrier_workgroup_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 
-; GFX1250-NEXT: s_wait_alu 0xffe3 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_barrier_signal -1 ; GFX1250-NEXT: s_barrier_wait -1 ; GFX1250-NEXT: s_endpgm @@ -168,11 +167,9 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; ; GFX1250-LABEL: test_s_barrier_agent_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_alu 0xffe3 ; GFX1250-NEXT: s_barrier_signal -1 ; GFX1250-NEXT: s_barrier_wait -1 ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 97d52d5f1f26d..6a76f4307dcad 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -82,6 +82,8 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX1250-LABEL: workgroup_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} @@ -153,6 +155,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX1250-LABEL: workgroup_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"} @@ -229,6 +233,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra 
!{!"amdgpu-synchronize-as", !"global"} @@ -305,6 +311,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} @@ -379,6 +387,8 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX1250-LABEL: workgroup_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} @@ -450,6 +460,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX1250-LABEL: workgroup_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} @@ -526,6 +538,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} @@ -602,6 +616,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} @@ -787,8 +803,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX1250-LABEL: agent_release_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -893,8 +908,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX1250-LABEL: agent_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1000,8 +1014,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX1250-LABEL: agent_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1190,8 +1203,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX1250-LABEL: agent_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1296,8 +1308,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: agent_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1403,8 +1414,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: agent_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 
; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1600,8 +1610,6 @@ define amdgpu_kernel void @system_release_fence() { ; GFX1250-LABEL: system_release_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1713,8 +1721,6 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX1250-LABEL: system_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -1827,8 +1833,6 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX1250-LABEL: system_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2024,8 +2028,6 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX1250-LABEL: system_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2137,8 +2139,6 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX1250-LABEL: system_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2251,8 +2251,6 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX1250-LABEL: 
system_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index b3f6533d43887..736a8b58466dd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1066,7 +1066,8 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; ; GFX1250-LABEL: workgroup_acquire_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire @@ -1146,7 +1147,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX1250-LABEL: workgroup_release_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release @@ -1231,7 +1233,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel @@ -1316,7 +1319,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst @@ -1391,6 +1395,8 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; ; GFX1250-LABEL: 
workgroup_one_as_acquire_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire @@ -1462,6 +1468,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX1250-LABEL: workgroup_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release @@ -1538,6 +1546,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel @@ -1614,6 +1624,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst @@ -1799,8 +1811,7 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX1250-LABEL: agent_release_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -1905,8 +1916,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX1250-LABEL: agent_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2012,8 +2022,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; ; GFX1250-LABEL: agent_seq_cst_fence: 
; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2202,8 +2211,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; ; GFX1250-LABEL: agent_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2308,8 +2316,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; ; GFX1250-LABEL: agent_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2415,8 +2422,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; ; GFX1250-LABEL: agent_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2612,8 +2618,6 @@ define amdgpu_kernel void @system_release_fence() { ; GFX1250-LABEL: system_release_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2725,8 +2729,6 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX1250-LABEL: system_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2839,8 +2841,6 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX1250-LABEL: system_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -3036,8 +3036,6 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX1250-LABEL: system_one_as_release_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -3149,8 +3147,6 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX1250-LABEL: system_one_as_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -3263,8 +3259,6 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX1250-LABEL: system_one_as_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 36adbc0011118..55ec0c2255f9b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -830,14 +830,10 @@ define amdgpu_kernel void 
@flat_agent_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -1000,6 +996,7 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1159,6 +1156,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1342,9 +1340,9 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -1529,9 +1527,9 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -1692,6 +1690,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -1882,6 +1881,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2067,9 +2067,9 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -2285,9 +2285,9 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: 
s_wait_storecnt_dscnt 0x0 @@ -2505,9 +2505,9 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2729,6 +2729,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2979,13 +2980,11 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -3235,13 +3234,11 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb 
scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -3496,6 +3493,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -3779,6 +3777,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4057,9 +4056,9 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -4368,9 +4367,9 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4681,9 +4680,9 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4970,6 +4969,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5255,6 +5255,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5564,9 +5565,9 @@ define amdgpu_kernel void 
@flat_agent_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5877,9 +5878,9 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6190,9 +6191,9 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6503,9 +6504,9 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 
-; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6816,9 +6817,9 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7129,9 +7130,9 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7442,9 +7443,9 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7755,9 +7756,9 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -8057,6 +8058,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8372,6 +8374,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -8697,9 +8700,9 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: 
global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9044,13 +9047,11 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9394,13 +9395,11 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9720,9 +9719,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10038,6 +10036,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -10382,13 +10381,11 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10732,13 +10729,11 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11082,13 +11077,11 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11432,13 +11425,11 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; 
GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11778,9 +11769,9 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12126,13 +12117,11 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -12476,13 +12465,11 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -12826,13 +12813,11 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -13684,14 +13669,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -13855,6 +13836,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14014,6 +13996,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14197,10 +14180,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14384,10 +14367,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14547,6 +14530,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -14733,6 +14717,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14918,10 +14903,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -15132,10 +15117,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -15348,10 +15333,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -15582,6 +15567,7 @@ define 
amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -15843,13 +15829,11 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -16110,13 +16094,11 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -16372,6 +16354,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16651,6 +16634,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16929,10 +16913,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -17236,10 +17220,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17545,10 +17529,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( 
; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17830,6 +17814,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18111,6 +18096,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18416,10 +18402,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: 
s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18725,10 +18711,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19034,10 +19020,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19343,10 +19329,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19652,10 +19638,10 @@ define 
amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19961,10 +19947,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20270,10 +20256,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20579,10 +20565,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20881,6 +20867,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21206,6 +21193,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -21532,10 +21520,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21889,13 +21877,11 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22250,13 +22236,11 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22587,9 +22571,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22916,6 +22899,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -23271,13 +23255,11 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -23632,13 +23614,11 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 
0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -23993,13 +23973,11 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -24354,13 +24332,11 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -24711,10 +24687,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -25070,13 +25046,11 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -25431,13 +25405,11 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; 
GFX1250-NEXT: s_wait_dscnt 0x0 @@ -25792,13 +25764,11 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index 5526b29037977..faa970e049bd2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -114,8 +114,6 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 964f1c8957f6f..721ecd8da5387 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -1329,8 +1329,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 
0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 871c941dd6dca..635895259ee32 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -936,6 +936,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1095,6 +1096,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1254,6 +1256,7 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1413,6 +1416,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1572,6 +1576,7 @@ define amdgpu_kernel void 
@flat_singlethread_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -1731,6 +1736,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -1890,6 +1896,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -2049,6 +2056,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -2208,6 +2216,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -2411,6 +2420,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, 
s[0:1] scope:SCOPE_SE @@ -2617,6 +2627,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -2823,6 +2834,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -3077,6 +3089,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -3329,6 +3342,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -3581,6 +3595,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -3833,6 +3848,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -4085,6 +4101,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -4337,6 +4354,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -4589,6 +4607,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -4841,6 +4860,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -5093,6 +5113,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -5345,6 +5366,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -5597,6 +5619,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -5849,6 +5872,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -6101,6 +6125,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 
0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -6353,6 +6378,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -6605,6 +6631,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -6901,6 +6928,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -7201,6 +7229,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -7501,6 +7530,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -7801,6 +7831,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8101,6 +8132,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8401,6 +8433,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8701,6 +8734,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9001,6 +9035,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9301,6 +9336,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9601,6 +9637,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9901,6 +9938,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10201,6 +10239,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10501,6 +10540,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10801,6 +10841,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11101,6 +11142,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -12037,6 +12079,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12196,6 +12239,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12355,6 +12399,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12514,6 +12559,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12673,6 +12719,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -12832,6 +12879,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -12991,6 +13039,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -13150,6 +13199,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -13309,6 +13359,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -13512,6 +13563,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -13718,6 +13770,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 
v0, v1, s[0:1] scope:SCOPE_SE @@ -13924,6 +13977,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -14178,6 +14232,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -14430,6 +14485,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -14682,6 +14738,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -14934,6 +14991,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -15186,6 +15244,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -15438,6 +15497,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -15690,6 +15750,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -15942,6 +16003,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16194,6 +16256,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16446,6 +16509,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16698,6 +16762,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16950,6 +17015,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -17202,6 +17268,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -17454,6 +17521,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -17706,6 +17774,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -18002,6 +18071,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -18302,6 +18372,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -18602,6 +18673,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -18902,6 +18974,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -19202,6 +19275,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -19502,6 +19576,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -19802,6 +19877,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -20102,6 +20178,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -20402,6 +20479,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -20702,6 +20780,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21002,6 +21081,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 
v0, v1, s[0:1] scope:SCOPE_SE @@ -21302,6 +21382,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21602,6 +21683,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21902,6 +21984,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -22202,6 +22285,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE diff --git 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 9d70a2437e553..e45a8e51c836c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -834,14 +834,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -1004,6 +1000,7 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1163,6 +1160,7 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1351,9 +1349,8 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -1543,9 +1540,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -1706,6 +1702,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -1898,6 +1895,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2088,9 +2086,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -2313,9 +2310,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2540,9 +2536,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2766,6 +2761,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -3023,13 +3019,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -3286,13 +3279,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -3547,6 +3537,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -3832,6 +3823,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -4115,9 +4107,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -4433,9 +4424,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4753,9 +4743,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5044,6 +5033,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -5331,6 +5321,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -5647,9 +5638,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5967,9 +5957,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6287,9 +6276,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6607,9 +6595,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6927,9 +6914,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7247,9 +7233,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7567,9 +7552,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7887,9 +7871,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; 
GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -8189,6 +8172,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8506,6 +8490,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -8836,9 +8821,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9190,13 +9174,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9547,13 +9528,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9875,9 +9853,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10195,6 +10172,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -10546,13 +10524,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10903,13 +10878,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11260,13 +11232,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11617,13 +11586,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11970,9 +11936,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12325,13 +12290,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -12682,13 +12644,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -13039,13 +12998,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 
0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -13901,14 +13857,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -14072,6 +14024,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14231,6 +14184,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14419,10 +14373,9 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14611,10 +14564,9 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -14774,6 +14726,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -14962,6 +14915,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15152,10 +15106,9 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -15373,10 +15326,9 @@ define amdgpu_kernel void 
@flat_system_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15596,10 +15548,9 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15832,6 +15783,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16100,13 +16052,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -16374,13 +16323,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -16636,6 +16582,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16917,6 +16864,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17200,10 +17148,9 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -17514,10 +17461,9 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17830,10 +17776,9 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18117,6 +18062,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18400,6 +18346,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18712,10 +18659,9 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -19028,10 +18974,9 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -19344,10 +19289,9 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; 
GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -19660,10 +19604,9 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -19976,10 +19919,9 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -20292,10 +20234,9 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -20608,10 +20549,9 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -20924,10 +20864,9 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -21226,6 +21165,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21553,6 +21493,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -21884,10 +21825,9 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -22248,13 +22188,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22616,13 +22553,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 
0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22955,9 +22889,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -23286,6 +23219,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -23648,13 +23582,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -24016,13 +23947,10 @@ define amdgpu_kernel void 
@flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -24384,13 +24312,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -24752,13 +24677,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; 
GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -25116,10 +25038,9 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -25482,13 +25403,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -25850,13 +25768,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -26218,13 +26133,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 77f52e4d4b9fd..41c5927cad4de 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -152,8 +152,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm @@ -433,8 +431,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm @@ -1151,7 +1147,9 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index f086542b3d1f8..041b3f51abc2f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -936,6 +936,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1095,6 +1096,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1254,6 +1256,7 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1413,6 +1416,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1572,6 +1576,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -1731,6 +1736,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -1890,6 +1896,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -2049,6 +2056,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -2208,6 +2216,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -2411,6 +2420,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -2617,6 +2627,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -2823,6 +2834,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -3077,6 +3089,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -3329,6 +3342,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -3581,6 +3595,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -3833,6 +3848,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -4085,6 +4101,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -4337,6 +4354,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -4589,6 +4607,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -4841,6 +4860,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 
v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -5093,6 +5113,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -5345,6 +5366,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -5597,6 +5619,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -5849,6 +5872,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -6101,6 +6125,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -6353,6 +6378,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -6605,6 +6631,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -6901,6 +6928,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -7201,6 +7229,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -7501,6 +7530,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -7801,6 +7831,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8101,6 +8132,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8401,6 +8433,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8701,6 +8734,7 @@ define 
amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9001,6 +9035,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9301,6 +9336,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9601,6 +9637,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9901,6 +9938,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10201,6 +10239,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10501,6 +10540,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10801,6 +10841,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11101,6 +11142,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -12037,6 +12079,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12196,6 +12239,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12355,6 +12399,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12514,6 +12559,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12673,6 +12719,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -12832,6 +12879,7 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -12991,6 +13039,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -13150,6 +13199,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -13309,6 +13359,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -13512,6 +13563,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -13718,6 +13770,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, 
v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -13924,6 +13977,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -14178,6 +14232,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -14430,6 +14485,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -14682,6 +14738,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -14934,6 +14991,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -15186,6 +15244,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -15438,6 +15497,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -15690,6 +15750,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -15942,6 +16003,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16194,6 +16256,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: 
; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16446,6 +16509,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16698,6 +16762,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -16950,6 +17015,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -17202,6 +17268,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -17454,6 +17521,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -17706,6 +17774,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -18002,6 +18071,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -18302,6 +18372,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -18602,6 +18673,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -18902,6 +18974,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -19202,6 +19275,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -19502,6 +19576,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -19802,6 +19877,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -20102,6 +20178,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -20402,6 +20479,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -20702,6 +20780,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21002,6 +21081,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21302,6 +21382,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21602,6 +21683,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21902,6 +21984,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index d8e6ad043e061..85ecab8128d2f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -816,7 +816,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -980,6 +981,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1139,6 +1141,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1315,7 +1318,9 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1492,7 +1497,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -1652,6 +1659,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -1830,8 +1838,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -2007,7 +2016,9 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -2203,9 +2214,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -2400,9 +2413,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: 
s_endpgm ptr %out, i32 %in) { entry: @@ -2617,6 +2632,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -2854,7 +2870,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -3092,7 +3110,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -3347,6 +3367,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -3618,8 +3639,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -3888,7 +3910,9 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -4177,9 +4201,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -4467,9 +4493,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 
0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -4740,8 +4768,9 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -5012,8 +5041,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -5301,9 +5331,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -5591,9 +5623,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -5881,9 +5915,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -6171,9 +6207,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -6469,6 +6507,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -6781,6 +6820,7 @@ define amdgpu_kernel 
void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -7098,7 +7138,9 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -7430,7 +7472,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -7762,7 +7806,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8077,6 +8123,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8389,6 +8436,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -8720,7 +8768,9 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9052,7 +9102,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, 
v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9384,7 +9436,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -9716,7 +9770,9 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10046,7 +10102,9 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10378,7 +10436,9 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -10710,7 +10770,9 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11042,7 +11104,9 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -11844,6 +11908,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX1250-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12007,6 +12073,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12166,6 +12233,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12335,6 +12403,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12504,6 +12575,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { @@ -12663,6 +12737,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 
+; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -12832,7 +12907,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13001,6 +13078,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { @@ -13180,7 +13260,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13359,7 +13443,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: @@ -13570,6 +13658,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 
; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -13796,6 +13885,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -14022,6 +14114,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -14276,6 +14371,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -14538,7 +14634,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -14800,6 +14898,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { @@ -15072,7 +15173,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15344,7 +15449,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15606,7 +15715,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] 
offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -15868,7 +15979,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -16140,7 +16253,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -16412,7 +16529,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -16684,7 +16805,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -16956,7 +17081,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -17228,7 +17357,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -17500,7 +17633,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -17772,7 +17909,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -18044,7 +18185,11 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: @@ -18340,6 +18485,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -18648,6 +18794,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -18958,6 +19105,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -19278,6 +19428,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -19598,6 +19751,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -19908,6 +20064,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -20216,6 +20373,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -20536,6 +20694,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -20856,6 +21017,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21176,6 +21340,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN 
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21496,6 +21663,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -21814,6 +21984,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -22134,6 +22307,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -22454,6 +22630,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE @@ -22774,6 +22953,9 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 184e15406bfbc..5c2d8eb4f5ec0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -834,14 +834,10 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1011,6 +1007,7 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1177,6 +1174,7 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1368,9 +1366,9 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -1563,9 +1561,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -1731,6 +1729,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1924,6 +1923,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; 
GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2115,9 +2115,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -2337,9 +2337,9 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2561,9 +2561,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2775,6 +2775,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3016,13 +3017,11 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3263,13 +3262,11 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3505,6 +3502,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3767,6 +3765,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -4027,9 +4026,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -4318,9 +4317,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -4611,9 +4610,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; 
GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -4879,6 +4878,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5143,6 +5143,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5432,9 +5433,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -5725,9 +5726,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -6018,9 +6019,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -6311,9 +6312,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -6604,9 +6605,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -6897,9 +6898,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -7190,9 +7191,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -7483,9 +7484,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -7752,6 +7753,7 @@ define amdgpu_kernel void 
@global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8035,6 +8037,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -8328,9 +8331,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -8644,13 +8647,11 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8963,13 +8964,11 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9257,9 +9256,8 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9543,6 +9541,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 
v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -9856,13 +9855,11 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10175,13 +10172,11 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10494,13 +10489,11 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; 
GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10813,13 +10806,11 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11128,9 +11119,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -11445,13 +11436,11 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11764,13 +11753,11 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12083,13 +12070,11 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12923,14 +12908,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13100,6 +13081,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13266,6 +13248,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13457,10 +13440,10 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13652,10 +13635,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -13820,6 +13803,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -14013,6 +13997,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14204,10 +14189,10 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -14426,10 
+14411,10 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14650,10 +14635,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14864,6 +14849,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -15105,13 +15091,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15352,13 +15336,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15594,6 +15576,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15856,6 +15839,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16116,10 +16100,10 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -16407,10 +16391,10 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16700,10 +16684,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16968,6 +16952,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17232,6 +17217,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17521,10 +17507,10 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17814,10 +17800,10 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18107,10 
+18093,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18400,10 +18386,10 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18693,10 +18679,10 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18986,10 +18972,10 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19279,10 +19265,10 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19572,10 +19558,10 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19841,6 +19827,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20124,6 +20111,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20437,13 +20425,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20756,13 +20742,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21050,9 +21034,8 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21336,6 +21319,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -21649,13 +21633,11 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21968,13 +21950,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22287,13 +22267,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22606,13 +22584,11 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22921,10 +22897,10 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -23238,13 +23214,11 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; 
GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23557,13 +23531,11 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23876,13 +23848,11 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index ed2d62356f8f2..ca7802d295e0b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -94,8 +94,6 @@ define amdgpu_kernel void 
@global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index c1bfe21865c15..d74c230488ea2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -1112,8 +1112,6 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 6a5a6e01c741b..e7f7b1d196be7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -952,6 +952,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1118,6 +1119,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1284,6 +1286,7 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1450,6 +1453,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1614,6 +1618,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1778,6 +1783,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1942,6 +1948,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2106,6 +2113,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 
0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2270,6 +2278,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2462,6 +2471,7 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2657,6 +2667,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2852,6 +2863,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3087,6 +3099,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3320,6 +3333,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3553,6 +3567,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3786,6 +3801,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4019,6 +4035,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4252,6 +4269,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4485,6 +4503,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4718,6 +4737,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4951,6 +4971,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5184,6 +5205,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5417,6 +5439,7 @@ define amdgpu_kernel void 
@global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5650,6 +5673,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5883,6 +5907,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6116,6 +6141,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6349,6 +6375,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; 
GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6612,6 +6639,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -6879,6 +6907,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7146,6 +7175,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7413,6 +7443,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7680,6 +7711,7 @@ define amdgpu_kernel void 
@global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7947,6 +7979,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8214,6 +8247,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8481,6 +8515,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8748,6 +8783,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9015,6 +9051,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9282,6 +9319,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9549,6 +9587,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9816,6 +9855,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10083,6 +10123,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10350,6 +10391,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11301,6 +11343,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11467,6 +11510,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11633,6 +11677,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11799,6 +11844,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11963,6 +12009,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12127,6 +12174,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12291,6 +12339,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12455,6 +12504,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12619,6 +12669,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; 
GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12811,6 +12862,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13006,6 +13058,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13201,6 +13254,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13436,6 +13490,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13669,6 +13724,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13902,6 +13958,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14135,6 +14192,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14368,6 +14426,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14601,6 +14660,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr 
addrspace(1) %out, i32 %in, i32 %old) { @@ -14834,6 +14894,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15067,6 +15128,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15300,6 +15362,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15533,6 +15596,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15766,6 +15830,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15999,6 +16064,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -16232,6 +16298,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -16465,6 +16532,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -16698,6 +16766,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -16961,6 +17030,7 @@ define amdgpu_kernel void 
@global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17228,6 +17298,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17495,6 +17566,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17762,6 +17834,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18029,6 +18102,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18296,6 +18370,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18563,6 +18638,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18830,6 +18906,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19097,6 +19174,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19364,6 +19442,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19631,6 +19710,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19898,6 +19978,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20165,6 +20246,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 
0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20432,6 +20514,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20699,6 +20782,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 7ddd515830e11..e7880a81800fd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -838,14 +838,10 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1015,6 +1011,7 @@ define amdgpu_kernel void 
@global_system_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1181,6 +1178,7 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1377,9 +1375,8 @@ define amdgpu_kernel void @global_system_release_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -1577,9 +1574,8 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -1745,6 +1741,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1940,6 +1937,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; 
GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2136,9 +2134,8 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -2365,9 +2362,8 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2596,9 +2592,8 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2812,6 +2807,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -3060,13 +3056,10 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3314,13 +3307,10 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3556,6 +3546,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3820,6 +3811,7 @@ define amdgpu_kernel void 
@global_system_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -4085,9 +4077,8 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -4383,9 +4374,8 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -4683,9 +4673,8 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -4953,6 +4942,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -5219,6 +5209,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -5515,9 +5506,8 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -5815,9 +5805,8 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -6115,9 +6104,8 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -6415,9 +6403,8 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -6684,6 +6671,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -6969,6 +6957,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -7289,13 +7278,10 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7615,13 +7601,10 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7911,9 +7894,8 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 
-; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8199,6 +8181,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -8519,13 +8502,10 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8845,13 +8825,10 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; 
GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9171,13 +9148,10 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9497,13 +9471,10 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9819,9 +9790,8 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -10143,13 +10113,10 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10469,13 +10436,10 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10795,13 +10759,10 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11639,14 +11600,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11816,6 +11773,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11982,6 +11940,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12178,10 +12137,9 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12378,10 +12336,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12546,6 +12503,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12741,6 +12699,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -12937,10 +12896,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -13166,10 +13124,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -13397,10 +13354,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -13613,6 +13569,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -13861,13 +13818,10 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14115,13 +14069,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14357,6 +14308,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14621,6 +14573,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -14886,10 +14839,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15184,10 +15136,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15484,10 +15435,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15754,6 +15704,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16020,6 +15971,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16316,10 +16268,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16616,10 +16567,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16916,10 +16866,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS 
-; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17216,10 +17165,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17516,10 +17464,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17816,10 +17763,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18116,10 +18062,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18416,10 +18361,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18685,6 +18629,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18970,6 +18915,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -19268,10 +19214,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19591,13 +19536,10 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19917,13 +19859,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; 
GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20213,9 +20152,8 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20501,6 +20439,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -20821,13 +20760,10 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21147,13 +21083,10 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21473,13 +21406,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21799,13 +21729,10 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22121,10 +22048,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -22445,13 +22371,10 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22771,13 +22694,10 @@ define amdgpu_kernel void 
@global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23097,13 +23017,10 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 1539fb574c0bd..3bf5ed8b2397f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -155,8 +155,6 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX1250-NEXT: 
s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm @@ -368,8 +366,6 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm @@ -1041,7 +1037,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 1aa8305b1a837..09eb062d876f6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -952,6 +952,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1118,6 +1119,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1284,6 
+1286,7 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1450,6 +1453,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1614,6 +1618,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1778,6 +1783,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1942,6 +1948,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2106,6 +2113,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr 
addrspace(1) %out, i32 %in) { @@ -2270,6 +2278,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2462,6 +2471,7 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2657,6 +2667,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2852,6 +2863,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3087,6 +3099,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3320,6 
+3333,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3553,6 +3567,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3786,6 +3801,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4019,6 +4035,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4252,6 +4269,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4485,6 +4503,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4718,6 +4737,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4951,6 +4971,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5184,6 +5205,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5417,6 +5439,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 
+; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5650,6 +5673,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5883,6 +5907,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6116,6 +6141,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6349,6 +6375,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6612,6 +6639,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -6879,6 +6907,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7146,6 +7175,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7413,6 +7443,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7680,6 +7711,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7947,6 +7979,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8214,6 +8247,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8481,6 +8515,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8748,6 +8783,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] 
@@ -9015,6 +9051,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9282,6 +9319,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9549,6 +9587,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9816,6 +9855,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10083,6 +10123,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10350,6 +10391,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11301,6 +11343,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11467,6 +11510,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11633,6 +11677,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11799,6 +11844,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 
0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -11963,6 +12009,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12127,6 +12174,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12291,6 +12339,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12455,6 +12504,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12619,6 +12669,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -12811,6 +12862,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: 
s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13006,6 +13058,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13201,6 +13254,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13436,6 +13490,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13669,6 +13724,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13902,6 +13958,7 @@ define 
amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14135,6 +14192,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14368,6 +14426,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14601,6 +14660,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14834,6 +14894,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15067,6 +15128,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15300,6 +15362,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15533,6 +15596,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15766,6 +15830,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15999,6 +16064,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -16232,6 +16298,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -16465,6 +16532,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -16698,6 +16766,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -16961,6 +17030,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, 
v1, s[0:1] @@ -17228,6 +17298,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17495,6 +17566,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -17762,6 +17834,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18029,6 +18102,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18296,6 +18370,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18563,6 +18638,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18830,6 +18906,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19097,6 +19174,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19364,6 +19442,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19631,6 +19710,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19898,6 +19978,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20165,6 +20246,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20432,6 +20514,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], 
s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20699,6 +20782,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 3eab16e6b9713..885edec03c2b6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -804,7 +804,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -975,6 +976,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1141,6 +1143,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ 
-1325,7 +1328,9 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1510,7 +1515,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1675,6 +1682,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1849,7 +1857,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -2031,7 +2041,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, 
v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2224,8 +2236,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -2417,8 +2432,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -2615,6 +2633,7 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2835,7 +2854,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3056,7 +3077,9 
@@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3292,6 +3315,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3535,7 +3559,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -3786,7 +3812,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4048,8 +4076,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( 
; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -4310,8 +4341,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -4554,7 +4588,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -4797,7 +4833,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -5058,8 
+5096,11 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -5320,8 +5361,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -5582,8 +5626,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -5844,8 +5891,11 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -6106,8 +6156,11 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -6368,8 +6421,11 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -6630,8 +6686,11 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, 
i32 %old) { entry: @@ -6892,8 +6951,11 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -7156,6 +7218,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7428,6 +7491,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7713,7 +7777,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8006,7 +8072,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8299,7 +8367,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8574,6 +8644,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8846,6 +8917,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9138,7 +9210,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9431,7 +9505,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9724,7 +9800,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10017,7 +10095,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10308,7 +10388,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10601,7 +10683,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10894,7 +10978,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11187,7 +11273,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11987,6 +12075,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -12157,6 +12247,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12323,6 +12414,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12499,6 +12591,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12675,6 +12770,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -12839,6 +12937,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -13013,7 +13112,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -13187,6 +13288,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -13371,7 +13475,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, 
s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -13555,7 +13663,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: @@ -13752,6 +13864,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13964,6 +14077,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14176,6 +14292,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14411,6 +14530,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14654,7 +14774,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -14897,6 +15019,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -15150,7 +15275,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -15403,7 +15532,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -15646,7 +15779,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -15889,7 +16024,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -16142,7 +16279,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -16395,7 +16536,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -16648,7 +16793,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -16901,7 +17050,11 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -17154,7 +17307,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -17407,7 +17564,11 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -17660,7 +17821,11 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -17913,7 +18078,11 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: @@ -18176,6 
+18345,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18448,6 +18618,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18725,6 +18896,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19009,6 +19183,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, 
v1, s[0:1] @@ -19293,6 +19470,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19567,6 +19747,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19839,6 +20020,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20123,6 +20305,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: 
global_store_b32 v0, v1, s[0:1] @@ -20407,6 +20592,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20691,6 +20879,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20975,6 +21166,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21257,6 +21451,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21541,6 +21738,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21825,6 +22025,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22109,6 +22312,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 102616b9a2065..986b48b60a443 
100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -762,7 +762,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -1235,7 +1236,8 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { @@ -1404,7 +1406,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { @@ -1890,7 +1893,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { @@ -2075,7 +2079,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2261,7 +2266,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2679,7 +2685,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2899,7 +2906,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -3475,7 +3483,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { @@ -3689,7 +3698,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -3904,7 +3914,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -4511,7 +4522,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -4726,7 +4738,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -4941,7 +4954,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5156,7 +5170,8 @@ define amdgpu_kernel void 
@local_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5371,7 +5386,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5586,7 +5602,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5801,7 +5818,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6016,7 +6034,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 
offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6714,7 +6733,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -6964,7 +6984,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -7214,7 +7235,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -7926,7 +7948,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8176,7 +8199,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8426,7 +8450,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8676,7 +8701,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8926,7 +8952,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -9176,7 +9203,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, 
v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -9426,7 +9454,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -9676,7 +9705,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 1356fe4854170..81bbe0a78203e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -762,7 +762,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -1235,7 +1236,8 @@ define amdgpu_kernel void @local_system_release_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { @@ -1404,7 +1406,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { @@ -1890,7 +1893,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { @@ -2075,7 +2079,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2261,7 +2266,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2679,7 +2685,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2899,7 +2906,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -3475,7 +3483,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { @@ -3689,7 +3698,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -3904,7 +3914,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -4511,7 +4522,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: 
v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -4726,7 +4738,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -4941,7 +4954,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5156,7 +5170,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5371,7 +5386,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5586,7 +5602,8 @@ define amdgpu_kernel void 
@local_system_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5801,7 +5818,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6016,7 +6034,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6714,7 +6733,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -6964,7 +6984,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -7214,7 +7235,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -7926,7 +7948,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8176,7 +8199,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8426,7 +8450,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8676,7 +8701,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8926,7 +8952,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -9176,7 +9203,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -9426,7 +9454,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -9676,7 +9705,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 75e28f9008e28..980141a87ecf3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -890,7 +890,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 6aaf9d323b1fd..6a233a2c9013b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -762,7 +762,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_load_b32 v1, v0 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -1235,7 +1236,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { @@ -1404,7 
+1406,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_store_b32 v0, v1 ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { @@ -1890,7 +1893,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { @@ -2075,7 +2079,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2261,7 +2266,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -2679,7 +2685,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 
0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -2899,7 +2906,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -3475,7 +3483,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { @@ -3689,7 +3698,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -3904,7 +3914,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -4511,7 +4522,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -4726,7 +4738,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -4941,7 +4954,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5156,7 +5170,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5371,7 +5386,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5586,7 +5602,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX1250-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -5801,7 +5818,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6016,7 +6034,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm @@ -6714,7 +6733,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -6964,7 +6984,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -7214,7 +7235,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -7926,7 +7948,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8176,7 +8199,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8426,7 +8450,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8676,7 +8701,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -8926,7 +8952,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -9176,7 +9203,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -9426,7 +9454,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 @@ -9676,7 +9705,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll index 4ca0cc92e09be..8ac3414da7354 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll @@ -804,13 +804,9 @@ define amdgpu_kernel void @private_agent_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -992,6 +988,7 @@ define amdgpu_kernel void @private_agent_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1170,6 +1167,7 @@ define amdgpu_kernel void @private_agent_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1348,9 +1346,9 @@ define amdgpu_kernel void @private_agent_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -1530,9 +1528,9 @@ define amdgpu_kernel void @private_agent_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -1712,6 +1710,7 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1890,6 +1889,7 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2070,9 +2070,9 @@ define amdgpu_kernel void @private_agent_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -2252,9 +2252,9 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2436,9 +2436,9 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2674,6 +2674,7 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2910,13 +2911,11 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; 
kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3152,13 +3151,11 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3430,6 +3427,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3699,6 +3697,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; 
GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -3970,9 +3969,9 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm @@ -4243,9 +4242,9 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4518,9 +4517,9 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; 
GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4793,6 +4792,7 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5064,6 +5064,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -5335,9 +5336,9 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5610,9 +5611,9 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], 
v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5885,9 +5886,9 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6160,9 +6161,9 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6435,9 +6436,9 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6710,9 +6711,9 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6985,9 +6986,9 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7260,9 +7261,9 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -7563,6 +7564,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7864,6 +7866,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -8166,9 +8169,9 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -8471,13 +8474,11 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8779,13 +8780,11 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9087,9 +9086,8 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9391,6 +9389,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -9693,13 +9692,11 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10001,13 +9998,11 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10309,13 +10304,11 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10617,13 +10610,11 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10925,9 +10916,9 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -11231,13 +11222,11 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11539,13 +11528,11 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11847,13 +11834,11 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -12658,13 +12643,9 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load( ; GFX1250-NEXT: 
v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -12847,6 +12828,7 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13025,6 +13007,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13203,10 +13186,10 @@ define amdgpu_kernel void @private_agent_one_as_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13385,10 +13368,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store( ; 
GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13567,6 +13550,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13745,6 +13729,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -13925,10 +13910,10 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -14107,10 +14092,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw( ; 
GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14291,10 +14276,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14529,6 +14514,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -14766,13 +14752,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -15009,13 +14993,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -15288,6 +15270,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15557,6 +15540,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -15828,10 +15812,10 @@ define amdgpu_kernel void 
@private_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16101,10 +16085,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16376,10 +16360,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16651,6 +16635,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -16922,6 +16907,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17193,10 +17179,10 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17468,10 +17454,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -17743,10 +17729,10 @@ define amdgpu_kernel void 
@private_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18018,10 +18004,10 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18293,10 +18279,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18568,10 +18554,10 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -18843,10 +18829,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19118,10 +19104,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -19421,6 +19407,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19722,6 +19709,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -20025,13 +20013,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20334,13 +20320,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: 
s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20643,9 +20627,8 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20948,6 +20931,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -21251,13 +21235,11 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -21560,13 +21542,11 @@ define amdgpu_kernel 
void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -21869,13 +21849,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22178,13 +22156,11 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22487,10 +22463,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -22794,13 +22770,11 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -23103,13 +23077,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; 
GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -23412,13 +23384,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll index e9ee6b4925a13..f5ba70e454823 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll @@ -984,6 +984,7 @@ define amdgpu_kernel void @private_singlethread_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1162,6 +1163,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1340,6 +1342,7 @@ define amdgpu_kernel void @private_singlethread_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1518,6 +1521,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1696,6 +1700,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1874,6 +1879,7 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2052,6 +2058,7 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; 
GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2230,6 +2237,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2408,6 +2416,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2640,6 +2649,7 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -2875,6 +2885,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3110,6 +3121,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3381,6 +3393,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3650,6 +3663,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3919,6 +3933,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4188,6 +4203,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4457,6 +4473,7 @@ define amdgpu_kernel void 
@private_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4726,6 +4743,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4995,6 +5013,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5264,6 +5283,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5533,6 +5553,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5802,6 +5823,7 
@@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6071,6 +6093,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6340,6 +6363,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6609,6 +6633,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6878,6 +6903,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 
%old) { @@ -7147,6 +7173,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -7444,6 +7471,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7745,6 +7773,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8046,6 +8075,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8347,6 +8377,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8648,6 +8679,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8949,6 +8981,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9250,6 +9283,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9551,6 +9585,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9852,6 +9887,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10153,6 +10189,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10454,6 +10491,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10755,6 +10793,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11056,6 +11095,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11357,6 +11397,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11658,6 +11699,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -12641,6 +12683,7 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12819,6 +12862,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def 
$vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12997,6 +13041,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13175,6 +13220,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13353,6 +13399,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13531,6 +13578,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13709,6 +13757,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed 
$exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13887,6 +13936,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -14065,6 +14115,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -14297,6 +14348,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14532,6 +14584,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14767,6 +14820,7 @@ define amdgpu_kernel void 
@private_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -15038,6 +15092,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15307,6 +15362,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15576,6 +15632,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15845,6 +15902,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], 
v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16114,6 +16172,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16383,6 +16442,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16652,6 +16712,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16921,6 +16982,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17190,6 +17252,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17459,6 +17522,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17728,6 +17792,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17997,6 +18062,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18266,6 +18332,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18535,6 +18602,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18804,6 +18872,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -19101,6 +19170,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19402,6 +19472,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19703,6 +19774,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20004,6 +20076,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20305,6 +20378,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20606,6 +20680,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20907,6 +20982,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21208,6 +21284,7 @@ define amdgpu_kernel void 
@private_singlethread_one_as_release_acquire_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21509,6 +21586,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21810,6 +21888,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22111,6 +22190,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22412,6 +22492,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22713,6 +22794,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23014,6 +23096,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23315,6 +23398,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll index 24ec3a34c4e6e..1e2153f76bc03 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll @@ -804,13 +804,9 @@ define amdgpu_kernel void @private_system_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -992,6 +988,7 @@ define amdgpu_kernel void @private_system_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1170,6 +1167,7 @@ define amdgpu_kernel void @private_system_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1349,9 +1347,8 @@ define amdgpu_kernel void @private_system_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -1532,9 +1529,8 @@ define amdgpu_kernel 
void @private_system_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -1714,6 +1710,7 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1892,6 +1889,7 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2073,9 +2071,8 @@ define amdgpu_kernel void @private_system_release_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -2256,9 +2253,8 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2441,9 +2437,8 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2679,6 +2674,7 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -2916,13 +2912,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3159,13 +3152,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3437,6 +3427,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3706,6 +3697,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -3978,9 +3970,8 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm @@ -4252,9 +4243,8 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg( 
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4528,9 +4518,8 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -4803,6 +4792,7 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -5074,6 +5064,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -5346,9 +5337,8 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5622,9 +5612,8 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -5898,9 +5887,8 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6174,9 +6162,8 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS 
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -6477,6 +6464,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -6778,6 +6766,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -7081,13 +7070,10 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7390,13 +7376,10 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7698,9 +7681,8 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8002,6 +7984,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -8305,13 +8288,10 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8614,13 +8594,10 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8923,13 +8900,10 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9232,13 +9206,10 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9541,9 +9512,8 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -9848,13 +9818,10 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10157,13 +10124,10 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10466,13 +10430,10 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11277,13 +11238,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; 
GFX1250-NEXT: s_wait_dscnt 0x0 @@ -11466,6 +11423,7 @@ define amdgpu_kernel void @private_system_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -11644,6 +11602,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -11823,10 +11782,9 @@ define amdgpu_kernel void @private_system_one_as_release_store( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12006,10 +11964,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12188,6 +12145,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; 
GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -12366,6 +12324,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -12547,10 +12506,9 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -12730,10 +12688,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -12915,10 +12872,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 
0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -13153,6 +13109,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -13391,13 +13348,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -13635,13 +13589,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ 
-13914,6 +13865,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -14183,6 +14135,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -14455,10 +14408,9 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -14729,10 +14681,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: 
global_inv scope:SCOPE_SYS @@ -15005,10 +14956,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15280,6 +15230,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15551,6 +15502,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -15823,10 +15775,9 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; 
GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16099,10 +16050,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16375,10 +16325,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16651,10 +16600,9 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -16927,10 +16875,9 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17203,10 +17150,9 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17479,10 +17425,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -17755,10 +17700,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18058,6 +18002,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -18359,6 +18304,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -18663,10 +18609,9 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -18969,13 +18914,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; 
GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -19279,13 +19221,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -19588,9 +19527,8 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -19893,6 +19831,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed 
$exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -20197,13 +20136,10 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20507,13 +20443,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -20817,13 +20750,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: 
s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -21127,13 +21057,10 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -21437,10 +21364,9 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS @@ -21745,13 +21671,10 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; 
GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22055,13 +21978,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 @@ -22365,13 +22285,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: s_wait_bvhcnt 0x0 -; GFX1250-NEXT: s_wait_samplecnt 0x0 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll index 8b2254412c0c8..28d9d5dacd9e3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll @@ -984,6 +984,7 @@ define amdgpu_kernel void @private_wavefront_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1162,6 +1163,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1340,6 +1342,7 @@ define amdgpu_kernel void @private_wavefront_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1518,6 +1521,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1696,6 +1700,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; 
GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1874,6 +1879,7 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2052,6 +2058,7 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2230,6 +2237,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2408,6 +2416,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2640,6 +2649,7 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -2875,6 +2885,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3110,6 +3121,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3381,6 +3393,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3650,6 +3663,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3919,6 +3933,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4188,6 +4203,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4457,6 +4473,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4726,6 +4743,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4995,6 +5013,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5264,6 +5283,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg( ; GFX1250-NEXT: 
v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5533,6 +5553,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -5802,6 +5823,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6071,6 +6093,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6340,6 +6363,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6609,6 +6633,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg( ; 
GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -6878,6 +6903,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -7147,6 +7173,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -7444,6 +7471,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7745,6 +7773,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8046,6 +8075,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8347,6 +8377,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8648,6 +8679,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8949,6 +8981,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9250,6 +9283,7 @@ define amdgpu_kernel void 
@private_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9551,6 +9585,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9852,6 +9887,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10153,6 +10189,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10454,6 +10491,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10755,6 +10793,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11056,6 +11095,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11357,6 +11397,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11658,6 +11699,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, 
v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -12641,6 +12683,7 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12819,6 +12862,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12997,6 +13041,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13175,6 +13220,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13353,6 +13399,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13531,6 +13578,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13709,6 +13757,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13887,6 +13936,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -14065,6 +14115,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -14297,6 +14348,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 
th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14532,6 +14584,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14767,6 +14820,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -15038,6 +15092,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15307,6 +15362,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15576,6 +15632,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: 
v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15845,6 +15902,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16114,6 +16172,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16383,6 +16442,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16652,6 +16712,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -16921,6 +16982,7 @@ define amdgpu_kernel void 
@private_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17190,6 +17252,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17459,6 +17522,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17728,6 +17792,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -17997,6 +18062,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 
%old) { @@ -18266,6 +18332,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18535,6 +18602,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -18804,6 +18872,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -19101,6 +19170,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19402,6 +19472,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 
0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19703,6 +19774,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20004,6 +20076,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20305,6 +20378,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20606,6 +20680,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20907,6 +20982,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21208,6 +21284,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21509,6 +21586,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21810,6 +21888,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22111,6 +22190,7 @@ 
define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22412,6 +22492,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22713,6 +22794,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23014,6 +23096,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23315,6 +23398,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 
v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll index 127434c365f95..01b2f6835cf7b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll @@ -803,7 +803,8 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -985,6 +986,7 @@ define amdgpu_kernel void @private_workgroup_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1163,6 +1165,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1341,7 +1344,9 @@ define amdgpu_kernel void 
@private_workgroup_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1520,7 +1525,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -1699,6 +1706,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -1877,8 +1885,9 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -2056,7 +2065,9 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; 
GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -2235,9 +2246,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -2415,9 +2428,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -2649,6 +2664,7 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -2884,7 +2900,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; 
kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3120,7 +3138,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -3392,6 +3412,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -3661,8 +3682,9 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -3931,7 +3953,9 @@ define amdgpu_kernel void 
@private_workgroup_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -4201,9 +4225,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -4472,9 +4498,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -4743,8 +4771,9 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 
v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -5013,8 +5042,9 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -5283,9 +5313,11 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -5554,9 +5586,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -5825,9 +5859,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: 
; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -6096,9 +6132,11 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -6367,9 +6405,11 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -6638,9 +6678,11 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -6909,9 +6951,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -7180,9 +7224,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -7479,6 +7525,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -7780,6 +7827,7 @@ define amdgpu_kernel void 
@private_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8081,7 +8129,9 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8383,7 +8433,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8685,7 +8737,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -8987,6 +9041,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9288,6 +9343,7 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9589,7 +9645,9 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -9891,7 +9949,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10193,7 +10253,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10495,7 +10557,9 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -10797,7 +10861,9 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ 
-11099,7 +11165,9 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11401,7 +11469,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -11703,7 +11773,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -12506,6 +12578,8 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load( ; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2 ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -12687,6 +12761,7 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -12865,6 +12940,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13043,6 +13119,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13221,6 +13300,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(5) %out) { @@ -13399,6 +13481,7 @@ define amdgpu_kernel void 
@private_workgroup_one_as_monotonic_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13577,7 +13660,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -13755,6 +13840,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { @@ -13933,7 +14021,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -14111,7 +14203,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in) { entry: @@ -14343,6 +14439,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14578,6 +14675,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -14813,6 +14913,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v1, v2 ; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -15084,6 +15187,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; 
GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15353,7 +15457,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -15622,6 +15728,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { @@ -15891,7 +16000,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -16160,7 +16273,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -16429,7 +16546,9 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -16698,7 +16817,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -16967,7 +17088,11 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -17236,7 +17361,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -17505,7 +17634,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -17774,7 +17907,11 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -18043,7 +18180,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -18312,7 +18453,11 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: 
v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -18581,7 +18726,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -18850,7 +18999,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s0 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_endpgm ptr addrspace(5) %out, i32 %in, i32 %old) { entry: @@ -19147,6 +19300,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19448,6 +19602,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def 
$vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -19749,6 +19904,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20050,6 +20208,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20351,6 +20512,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20652,6 +20816,7 @@ define amdgpu_kernel void 
@private_workgroup_one_as_monotonic_acquire_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -20953,6 +21118,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21254,6 +21420,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21555,6 +21724,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -21856,6 
+22028,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22157,6 +22332,9 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22458,6 +22636,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -22759,6 +22940,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23060,6 +23244,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE @@ -23361,6 +23548,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: v_mov_b32_e32 v4, s1 ; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX1250-NEXT: v_mov_b32_e32 v3, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE diff --git a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s index e19be77e901fa..3c693610bee51 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s @@ -40,7 +40,7 @@ // OBJDUMP-NEXT: 0040 01000000 01000000 0c000000 00000000 // OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00c00000 -// OBJDUMP-NEXT: 0070 005021c4 410f007f 5e068200 00000000 +// OBJDUMP-NEXT: 0070 005021c4 410f007f 5e048200 00000000 // special_sgpr // OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000 // OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000 @@ -127,7 +127,6 @@ max_vgprs: .amdhsa_user_sgpr_kernarg_preload_length 
2 .amdhsa_user_sgpr_kernarg_preload_offset 1 .amdhsa_user_sgpr_private_segment_size 1 - .amdhsa_uses_cu_stores 1 .amdhsa_wavefront_size32 1 .amdhsa_enable_private_segment 1 .amdhsa_system_sgpr_workgroup_id_x 0 @@ -168,7 +167,6 @@ max_vgprs: // ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 2 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 -// ASM-NEXT: .amdhsa_uses_cu_stores 1 // ASM-NEXT: .amdhsa_wavefront_size32 1 // ASM-NEXT: .amdhsa_enable_private_segment 1 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 diff --git a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test index 369005f4ea432..fdca11b95caa6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test +++ b/llvm/test/MC/Disassembler/AMDGPU/kernel-descriptor-errors.test @@ -13,10 +13,10 @@ # RES_4_2: ; error decoding test.kd: kernel descriptor reserved bits in range (511:480) set # RES_4_2-NEXT: ; decoding failed region as bytes -# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003000000000000 \ -# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_456 -# RES_456: ; error decoding test.kd: kernel descriptor reserved bits in range (456:455) set -# RES_456-NEXT: ; decoding failed region as bytes +# RUN: yaml2obj %s -DGPU=GFX90A -DKD=00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000006000000000000 \ +# RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=RES_457 +# RES_457: ; error decoding test.kd: kernel descriptor reserved bits in range (457:455) set +# RES_457-NEXT: ; decoding failed region as bytes # RUN: yaml2obj %s -DGPU=GFX90A 
-DKD=0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000c000000000000 \ # RUN: | llvm-objdump --disassemble-symbols=test.kd - | FileCheck %s --check-prefix=WF32 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s index 99a4df3e5adfb..3e96ea3c67380 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx1250.s @@ -49,7 +49,6 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; CHECK-NEXT: .amdhsa_uses_cu_stores 1 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 ; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel @@ -57,7 +56,6 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_inst_pref_size 0 - .amdhsa_uses_cu_stores 1 .end_amdhsa_kernel ;--- 2.s @@ -107,7 +105,6 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -; CHECK-NEXT: .amdhsa_uses_cu_stores 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 ; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel @@ -116,6 +113,5 @@ .amdhsa_next_free_vgpr 32 .amdhsa_next_free_sgpr 32 .amdhsa_named_barrier_count 7 - .amdhsa_uses_cu_stores 0 .amdhsa_inst_pref_size 63 .end_amdhsa_kernel