diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d569a082cebe0..727a98c41bce4 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1264,25 +1264,32 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
   // If we have a COPY with other uses in the loop, hoist to allow the users to
   // also be hoisted.
-  Register DefReg;
-  if (MI.isCopy() && (DefReg = MI.getOperand(0).getReg()).isVirtual() &&
-      MI.getOperand(1).getReg().isVirtual() &&
-      IsLoopInvariantInst(MI, CurLoop) &&
-      any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
-             [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
-               if (!CurLoop->contains(&UseMI))
-                 return false;
-
-               // COPY is a cheap instruction, but if moving it won't cause high
-               // RP we're fine to hoist it even if the user can't be hoisted
-               // later Otherwise we want to check the user if it's hoistable
-               if (CanCauseHighRegPressure(Cost, false) &&
-                   !CurLoop->isLoopInvariant(UseMI, DefReg))
-                 return false;
-
-               return true;
-             }))
-    return true;
+  // TODO: Handle all isCopyLike?
+  if (MI.isCopy() || MI.isRegSequence()) {
+    Register DefReg = MI.getOperand(0).getReg();
+    if (DefReg.isVirtual() &&
+        all_of(MI.uses(),
+               [](const MachineOperand &UseOp) {
+                 return !UseOp.isReg() || UseOp.getReg().isVirtual();
+               }) &&
+        IsLoopInvariantInst(MI, CurLoop) &&
+        any_of(MRI->use_nodbg_instructions(DefReg),
+               [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
+                 if (!CurLoop->contains(&UseMI))
+                   return false;
+
+                 // COPY is a cheap instruction, but if moving it won't cause
+                 // high RP we're fine to hoist it even if the user can't be
+                 // hoisted later. Otherwise we want to check whether the user
+                 // itself is hoistable.
+                 if (CanCauseHighRegPressure(Cost, false) &&
+                     !CurLoop->isLoopInvariant(UseMI, DefReg))
+                   return false;
+
+                 return true;
+               }))
+      return true;
+  }
 
   // High register pressure situation, only hoist if the instruction is going
   // to be remat'ed.
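The functional change above extends the COPY hoisting heuristic to REG_SEQUENCE, provided the defined register and every register use are virtual. As a minimal sketch of the new operand screen, assuming a hypothetical free-standing helper (`allRegUsesVirtual` is not part of the patch):

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"

// Hypothetical helper restating the patch's all_of() condition.
// REG_SEQUENCE use operands alternate between registers and subreg
// indices, so non-register operands are skipped rather than rejected.
static bool allRegUsesVirtual(const llvm::MachineInstr &MI) {
  return llvm::all_of(MI.uses(), [](const llvm::MachineOperand &UseOp) {
    return !UseOp.isReg() || UseOp.getReg().isVirtual();
  });
}
```

The patch deliberately declines to reason about physical-register uses (see the comment in the new MIR test below), so a REG_SEQUENCE reading `$vgpr1` directly stays in the loop, as the `licm_reg_sequence_physreg_use` test checks.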
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index f5c2bd6286cb8..41a883302e8f7 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -8907,17 +8907,17 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x0
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; SI-NEXT:    s_mov_b64 s[8:9], 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v2, s10
-; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    v_mov_b32_e32 v3, s5
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:  .LBB127_1: ; %atomicrmw.start
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
new file mode 100644
index 0000000000000..e9945f005d264
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/machinelicm-copy-like-instrs.mir
@@ -0,0 +1,134 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -run-pass=early-machinelicm -simplify-mir -o - %s | FileCheck %s
+
+# Test machine LICM handling of copy-like instructions (REG_SEQUENCE,
+# INSERT_SUBREG) that are loop invariant.
+
+---
+name: licm_reg_sequence
+body: |
+  ; CHECK-LABEL: name: licm_reg_sequence
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   S_NOP 0, implicit [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $vgpr0 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    successors: %bb.1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
+    S_NOP 0, implicit %3
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr0 = COPY %3
+    S_ENDPGM 0
+
+...
+
+# Don't bother handling reg_sequence with physreg uses (is there any
+# reason for these to be legal?)
+---
+name: licm_reg_sequence_physreg_use
+body: |
+  ; CHECK-LABEL: name: licm_reg_sequence_physreg_use
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, $vgpr1, %subreg.sub1
+  ; CHECK-NEXT:   S_NOP 0, implicit [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $vgpr0 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    successors: %bb.1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    liveins: $vgpr0
+
+    %3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, $vgpr1, %subreg.sub1
+    S_NOP 0, implicit %3
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr0 = COPY %3
+    S_ENDPGM 0
+
+...
+
+---
+name: licm_insert_subreg
+body: |
+  ; CHECK-LABEL: name: licm_insert_subreg
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   liveins: $vgpr0, $vgpr1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.sub0
+  ; CHECK-NEXT:   [[INSERT_SUBREG1:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[INSERT_SUBREG]], [[COPY1]], %subreg.sub1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   S_NOP 0, implicit [[INSERT_SUBREG1]]
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $vgpr0_vgpr1 = COPY [[INSERT_SUBREG1]]
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    successors: %bb.1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    %3:vreg_64 = IMPLICIT_DEF
+    %4:vreg_64 = INSERT_SUBREG %3, %0, %subreg.sub0
+    %5:vreg_64 = INSERT_SUBREG %4, %1, %subreg.sub1
+    S_NOP 0, implicit %5
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr0_vgpr1 = COPY %5
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 7c351d2b8443b..a50a0766f67c2 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -8,9 +8,10 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s10, -1
 ; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s11, s7
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    s_mov_b32 s8, s4
+; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_branch .LBB0_2
 ; GCN-NEXT:  .LBB0_1: ; %loop.exit.guard
@@ -20,7 +21,6 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:  .LBB0_2: ; %bb1
 ; GCN-NEXT:    ; =>This Loop Header: Depth=1
 ; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
-; GCN-NEXT:    s_mov_b32 s11, s7
 ; GCN-NEXT:    buffer_load_dword v1, off, s[8:11], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
diff --git a/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll b/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
index 867ce3b930f8f..69ba266227265 100644
--- a/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
+++ b/llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
@@ -69,6 +69,7 @@ b18:                                              ; preds = %b16, %b7
   br label %b22
 
 b21:                                              ; preds = %b22
+  store volatile <64 x i32> %v20, ptr null
   tail call void @sammy() #3
   br label %b7