-
Notifications
You must be signed in to change notification settings - Fork 10.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
MachineLICM: Allow hoisting REG_SEQUENCE #90638
Conversation
This is just a fancy copy. Extend the copy handling to cover reg_sequence with only virtual register uses. This avoids some test regressions in a future commit.
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesThis is just a fancy copy. Extend the copy handling to cover Full diff: https://github.com/llvm/llvm-project/pull/90638.diff 5 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d569a082cebe06..727a98c41bce4c 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1264,25 +1264,32 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
// If we have a COPY with other uses in the loop, hoist to allow the users to
// also be hoisted.
- Register DefReg;
- if (MI.isCopy() && (DefReg = MI.getOperand(0).getReg()).isVirtual() &&
- MI.getOperand(1).getReg().isVirtual() &&
- IsLoopInvariantInst(MI, CurLoop) &&
- any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
- [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
- if (!CurLoop->contains(&UseMI))
- return false;
-
- // COPY is a cheap instruction, but if moving it won't cause high
- // RP we're fine to hoist it even if the user can't be hoisted
- // later Otherwise we want to check the user if it's hoistable
- if (CanCauseHighRegPressure(Cost, false) &&
- !CurLoop->isLoopInvariant(UseMI, DefReg))
- return false;
-
- return true;
- }))
- return true;
+ // TODO: Handle all isCopyLike?
+ if (MI.isCopy() || MI.isRegSequence()) {
+ Register DefReg = MI.getOperand(0).getReg();
+ if (DefReg.isVirtual() &&
+ all_of(MI.uses(),
+ [](const MachineOperand &UseOp) {
+ return !UseOp.isReg() || UseOp.getReg().isVirtual();
+ }) &&
+ IsLoopInvariantInst(MI, CurLoop) &&
+ any_of(MRI->use_nodbg_instructions(DefReg),
+ [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
+ if (!CurLoop->contains(&UseMI))
+ return false;
+
+ // COPY is a cheap instruction, but if moving it won't cause
+ // high RP we're fine to hoist it even if the user can't be
+ // hoisted later Otherwise we want to check the user if it's
+ // hoistable
+ if (CanCauseHighRegPressure(Cost, false) &&
+ !CurLoop->isLoopInvariant(UseMI, DefReg))
+ return false;
+
+ return true;
+ }))
+ return true;
+ }
// High register pressure situation, only hoist if the instruction is going
// to be remat'ed.
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index f5c2bd6286cb8e..41a883302e8f70 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -8907,17 +8907,17 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_mov_b32_e32 v5, s2
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s10
-; SI-NEXT: v_mov_b32_e32 v3, s11
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: .LBB127_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
new file mode 100644
index 00000000000000..e9945f005d2645
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -run-pass=early-machinelicm -simplify-mir -o - %s | FileCheck %s
+
+# Test to check machine LICM does not hoist convergent instructions,
+# DS_PERMUTE_B32 in this example.
+
+---
+name: licm_reg_sequence
+body: |
+ ; CHECK-LABEL: name: licm_reg_sequence
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ successors: %bb.1
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+ S_NOP 0, implicit %3
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ $vgpr0 = COPY %3
+ S_ENDPGM 0
+
+...
+
+# Don't bother handling reg_sequence with physreg uses (is there any
+# reason for these to be legal)?
+---
+name: licm_reg_sequence_physreg_use
+body: |
+ ; CHECK-LABEL: name: licm_reg_sequence_physreg_use
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, $vgpr1, %subreg.sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ successors: %bb.1
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+ liveins: $vgpr0
+
+ %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, $vgpr1, %subreg.sub1
+ S_NOP 0, implicit %3
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ $vgpr0 = COPY %3
+ S_ENDPGM 0
+
+...
+
+---
+name: licm_insert_subreg
+body: |
+ ; CHECK-LABEL: name: licm_insert_subreg
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.sub0
+ ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[INSERT_SUBREG]], [[COPY1]], %subreg.sub1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: S_NOP 0, implicit [[INSERT_SUBREG1]]
+ ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[INSERT_SUBREG1]]
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+ successors: %bb.1
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ %3:vreg_64 = IMPLICIT_DEF
+ %4:vreg_64 = INSERT_SUBREG %3, %0, %subreg.sub0
+ %5:vreg_64 = INSERT_SUBREG %4, %1, %subreg.sub1
+ S_NOP 0, implicit %5
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ $vgpr0_vgpr1 = COPY %5
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 7c351d2b8443b1..a50a0766f67c2c 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -8,9 +8,10 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s10, -1
; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s9, s5
; GCN-NEXT: s_mov_b32 s8, s4
+; GCN-NEXT: s_mov_b32 s9, s5
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_branch .LBB0_2
; GCN-NEXT: .LBB0_1: ; %loop.exit.guard
@@ -20,7 +21,6 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
; GCN-NEXT: .LBB0_2: ; %bb1
; GCN-NEXT: ; =>This Loop Header: Depth=1
; GCN-NEXT: ; Child Loop BB0_4 Depth 2
-; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v1
diff --git a/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll b/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
index 867ce3b930f8fc..69ba266227265c 100644
--- a/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
+++ b/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
@@ -69,6 +69,7 @@ b18: ; preds = %b16, %b7
br label %b22
b21: ; preds = %b22
+ store volatile <64 x i32> %v20, ptr null
tail call void @sammy() #3
br label %b7
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
Is the change to the Hexagon input expected, though?
@@ -69,6 +69,7 @@ b18: ; preds = %b16, %b7 | |||
br label %b22 | |||
|
|||
b21: ; preds = %b22 | |||
store volatile <64 x i32> %v20, ptr null |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unrelated change?
Yes, otherwise the spill it seemed to be looking for disappears so this defeats the optimization |
PAIR128 should probably just be removed entirely Depends #90638
This is just a fancy copy. Extend the copy handling to cover
reg_sequence with only virtual register uses. This avoids some
test regressions in a future commit.