[AMDGPU] Eliminate no effect instructions before s_endpgm

Differential Revision: https://reviews.llvm.org/D36585
llvm-svn: 310987
llvm · Aug 16, 2017 · a9487d9 · a9487d9
1 parent 0c6374e
commit a9487d9
Show file tree

Hide file tree

Showing 15 changed files with 406 additions and 32 deletions.
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -111,9 +111,62 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
   const SIInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+  DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
   bool Changed = false;
 
   for (MachineBasicBlock &MBB : MF) {
+
+    // Try to remove unneeded instructions before s_endpgm.
+    if (MBB.succ_empty()) {
+      if (MBB.empty() || MBB.back().getOpcode() != AMDGPU::S_ENDPGM)
+        continue;
+
+      SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});
+
+      while (!Blocks.empty()) {
+        auto CurBB = Blocks.pop_back_val();
+        auto I = CurBB->rbegin(), E = CurBB->rend();
+        if (I != E) {
+          if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
+            ++I;
+          else if (I->isBranch())
+            continue;
+        }
+
+        while (I != E) {
+          // Advance past debug values; a bare continue would loop forever.
+          if (I->isDebugValue()) {
+            ++I;
+            continue;
+          }
+          if (I->mayStore() || I->isBarrier() || I->isCall() ||
+              I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
+            break;
+          DEBUG(dbgs() << "Removing no effect instruction: " << *I << '\n');
+          for (auto &Op : I->operands())
+            if (Op.isReg())
+              RecalcRegs.insert(Op.getReg());
+
+          auto Next = std::next(I);
+          LIS->RemoveMachineInstrFromMaps(*I);
+          I->eraseFromParent();
+          I = Next;
+          Changed = true;
+        }
+
+        if (I != E)
+          continue;
+
+        // Try to ascend predecessors.
+        for (auto *Pred : CurBB->predecessors()) {
+          if (Pred->succ_size() == 1)
+            Blocks.push_back(Pred);
+        }
+      }
+      continue;
+    }
+
+    // Try to collapse adjacent endifs.
     auto Lead = MBB.begin(), E = MBB.end();
     if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
       continue;
@@ -174,9 +227,16 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
   }
 
   if (Changed) {
-    // Recompute liveness for both reg units of exec.
-    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC_LO, TRI));
-    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC_HI, TRI));
+    for (auto Reg : RecalcRegs) {
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        LIS->removeInterval(Reg);
+        if (!MRI.reg_empty(Reg))
+          LIS->createAndComputeVirtRegInterval(Reg);
+      } else {
+        for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U)
+          LIS->removeRegUnit(*U);
+      }
+    }
   }
 
   return Changed;

diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -20,7 +20,6 @@
 ; GCN: ds_write_b32
 
 ; GCN: [[BB5]]
-; GCN: s_or_b64 exec, exec
 ; GCN-NEXT: s_endpgm
 ; GCN-NEXT: .Lfunc_end
 define amdgpu_ps void @ham(float %arg, float %arg1) #0 {

diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -9,7 +9,6 @@
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN:      store_dword
 ; GCN-NEXT: {{^}}[[ENDIF]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @simple_nested_if(i32 addrspace(1)* nocapture %arg) {
 bb:
@@ -45,7 +44,6 @@ bb.outer.end:                                     ; preds = %bb.outer.then, %bb.
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER]]
 ; GCN:      store_dword
 ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @uncollapsable_nested_if(i32 addrspace(1)* nocapture %arg) {
 bb:
@@ -90,7 +88,6 @@ bb.outer.end:                                     ; preds = %bb.inner.then, %bb
 ; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]
 ; GCN:      store_dword
 ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @nested_if_if_else(i32 addrspace(1)* nocapture %arg) {
 bb:
@@ -141,13 +138,10 @@ bb.outer.end:                                        ; preds = %bb, %bb.then, %b
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN:      store_dword
 ; GCN-NEXT: s_and_saveexec_b64 [[SAVEEXEC_INNER_IF_OUTER_THEN:s\[[0-9:]+\]]]
-; GCN-NEXT: ; mask branch [[ENDIF_INNER_OUTER_THEN:BB[0-9_]+]]
+; GCN-NEXT: ; mask branch [[ENDIF_OUTER]]
 ; GCN-NEXT: {{^BB[0-9_]+}}:
 ; GCN:      store_dword
-; GCN-NEXT: {{^}}[[ENDIF_INNER_OUTER_THEN]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_INNER_IF_OUTER_THEN]]
 ; GCN-NEXT: {{^}}[[ENDIF_OUTER]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC_OUTER3]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @nested_if_else_if(i32 addrspace(1)* nocapture %arg) {
 bb:
@@ -183,6 +177,33 @@ bb.outer.end:
   ret void
 }
 
+; GCN-LABEL: {{^}}s_endpgm_unsafe_barrier:
+; GCN:      s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9:]+\]]]
+; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9_]+]]
+; GCN-NEXT: {{^BB[0-9_]+}}:
+; GCN:      store_dword
+; GCN-NEXT: {{^}}[[ENDIF]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVEEXEC]]
+; GCN:      s_barrier
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @s_endpgm_unsafe_barrier(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = icmp ugt i32 %tmp, 1
+  br i1 %tmp1, label %bb.then, label %bb.end
+
+bb.then:                                          ; preds = %bb
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tmp
+  store i32 0, i32 addrspace(1)* %tmp4, align 4
+  br label %bb.end
+
+bb.end:                                           ; preds = %bb.then, %bb
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare void @llvm.amdgcn.s.barrier() #1
 
 attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind convergent }