diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 927826c52404bf..ef662d55cb0a9d 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -42,7 +42,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
@@ -372,6 +374,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
   AMDGPU::IsaVersion IV;
 
   DenseSet<MachineInstr *> TrackedWaitcntSet;
+  // Pointers read by SMRD instructions, mapped to the block holding the read.
+  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  MachinePostDominatorTree *PDT;
 
   struct BlockInfo {
     MachineBasicBlock *MBB;
@@ -406,6 +411,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -792,6 +798,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
 
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
 
@@ -1012,6 +1019,17 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
       if (MI.mayStore()) {
         // FIXME: Should not be relying on memoperands.
         for (const MachineMemOperand *Memop : MI.memoperands()) {
+          // SMRD/VMEM write-after-read: wait for outstanding scalar loads of
+          // this pointer before the store is issued.
+          const Value *Ptr = Memop->getValue();
+          auto SLoad = SLoadAddresses.find(Ptr);
+          if (SLoad != SLoadAddresses.end()) {
+            addWait(Wait, LGKM_CNT, 0);
+            // Stop tracking the pointer once this block post-dominates the
+            // block of the recorded load.
+            if (PDT->dominates(MI.getParent(), SLoad->second))
+              SLoadAddresses.erase(SLoad);
+          }
           unsigned AS = Memop->getAddrSpace();
           if (AS != AMDGPUAS::LOCAL_ADDRESS)
             continue;
@@ -1399,6 +1417,15 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       }
     }
 
+    // Remember which pointers scalar loads read from so that a later store to
+    // the same pointer waits for them (see generateWaitcntInstBefore).
+    if (TII->isSMRD(Inst)) {
+      for (const MachineMemOperand *Memop : Inst.memoperands()) {
+        const Value *Ptr = Memop->getValue();
+        SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+      }
+    }
+
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1448,6 +1475,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
+  // Entries recorded for an earlier function hold stale block pointers.
+  SLoadAddresses.clear();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
diff --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
new file mode 100644
index 00000000000000..4ba16b4eb30bea
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+
+; GCN-LABEL: BB0_1
+; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off
+
+define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = icmp eq i32 %tmp, 0
+  br i1 %tmp2, label %bb3, label %bb8
+
+bb3: ; preds = %bb
+  %tmp4 = load i32, i32 addrspace(1)* %arg, align 4
+  store i32 0, i32 addrspace(1)* %arg, align 4
+  %tmp5 = zext i32 %tmp4 to i64
+  %tmp6 = load i64, i64 addrspace(1)* %arg1, align 8
+  %tmp7 = add i64 %tmp6, %tmp5
+  store i64 %tmp7, i64 addrspace(1)* %arg1, align 8
+  br label %bb8
+
+bb8: ; preds = %bb3, %bb
+  ret void
+}
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }