diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 6ab1d4550c51c..14885d5f9d08e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -558,6 +558,24 @@ class GVecReduce : public GenericMachineInstr {
   }
 };
 
+/// Represents a G_PHI.
+class GPhi : public GenericMachineInstr {
+public:
+  /// Returns the number of incoming values.
+  unsigned getNumIncomingValues() const { return (getNumOperands() - 1) / 2; }
+  /// Returns the I'th incoming vreg.
+  Register getIncomingValue(unsigned I) {
+    return getOperand(I * 2 + 1).getReg();
+  }
+  /// Returns the I'th incoming basic block.
+  MachineBasicBlock *getIncomingBlock(unsigned I) {
+    return getOperand(I * 2 + 2).getMBB();
+  }
+
+  static bool classof(const MachineInstr *MI) {
+    return MI->getOpcode() == TargetOpcode::G_PHI;
+  }
+};
 
 } // namespace llvm
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
index b1fcdd207a60a..4fbff4d10f8ab 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
@@ -67,10 +67,9 @@ class Localizer : public MachineFunctionPass {
 
   typedef SmallSetVector<MachineInstr *, 32> LocalizedSetVecT;
 
-  /// If \p Op is a phi operand and not unique in that phi, that is,
-  /// there are other operands in the phi with the same register,
-  /// return true.
-  bool isNonUniquePhiValue(MachineOperand &Op) const;
+  /// If \p Op is a reg operand of a PHI, return the total number of
+  /// operands in the PHI using the same register as \p Op, including itself.
+  unsigned getNumPhiUses(MachineOperand &Op) const;
 
   /// Do inter-block localization from the entry block.
   bool localizeInterBlock(MachineFunction &MF,
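For orientation before the Localizer changes below: a G_PHI's operand list is the def followed by (value, predecessor block) pairs, so operand 0 is the destination, operand I*2+1 is the I'th incoming register, and operand I*2+2 is the I'th incoming block. That is exactly the arithmetic the new accessors wrap. A minimal usage sketch (dumpPhiEdges is a name invented for illustration, not part of the patch):

    #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"
    #include "llvm/Support/Debug.h"
    using namespace llvm;

    // Walk a phi's incoming (value, block) pairs through the GPhi wrapper
    // instead of doing raw operand-index arithmetic at each use site.
    static void dumpPhiEdges(MachineInstr &MI) {
      auto *Phi = dyn_cast<GPhi>(&MI);
      if (!Phi)
        return; // classof only matches TargetOpcode::G_PHI.
      for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
        dbgs() << printReg(Phi->getIncomingValue(I)) << " from "
               << printMBBReference(*Phi->getIncomingBlock(I)) << '\n';
    }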
diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
index 55984423e5bc6..ae58e135931f4 100644
--- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -13,6 +13,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -58,18 +59,18 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
   return InsertMBB == Def.getParent();
 }
 
-bool Localizer::isNonUniquePhiValue(MachineOperand &Op) const {
-  MachineInstr *MI = Op.getParent();
-  if (!MI->isPHI())
-    return false;
+unsigned Localizer::getNumPhiUses(MachineOperand &Op) const {
+  auto *MI = dyn_cast<GPhi>(&*Op.getParent());
+  if (!MI)
+    return 0;
 
   Register SrcReg = Op.getReg();
-  for (unsigned Idx = 1; Idx < MI->getNumOperands(); Idx += 2) {
-    auto &MO = MI->getOperand(Idx);
-    if (&MO != &Op && MO.isReg() && MO.getReg() == SrcReg)
-      return true;
+  unsigned NumUses = 0;
+  for (unsigned I = 0, NumVals = MI->getNumIncomingValues(); I < NumVals; ++I) {
+    if (MI->getIncomingValue(I) == SrcReg)
+      ++NumUses;
   }
-  return false;
+  return NumUses;
 }
 
 bool Localizer::localizeInterBlock(MachineFunction &MF,
@@ -108,11 +109,12 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
       continue;
     }
 
-    // If the use is a phi operand that's not unique, don't try to localize.
-    // If we do, we can cause unnecessary instruction bloat by duplicating
-    // into each predecessor block, when the existing one is sufficient and
-    // allows for easier optimization later.
-    if (isNonUniquePhiValue(MOUse))
+    // PHIs look like a single user but can use the same register in multiple
+    // edges, causing remat into each predecessor. Allow this to a certain
+    // extent.
+    unsigned NumPhiUses = getNumPhiUses(MOUse);
+    const unsigned PhiThreshold = 2; // FIXME: Tune this more.
+    if (NumPhiUses > PhiThreshold)
       continue;
 
     LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
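This hunk is the behavioral core of the patch. The old query was a boolean, "is this register duplicated in the phi?", and any duplication blocked localization; the new one counts the phi edges carrying the register, and localization proceeds as long as at most two predecessors would each get a rematerialized copy. A condensed restatement of the decision (phiRematIsCheapEnough is a helper invented here; the pass inlines this logic rather than calling anything by that name):

    #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
    using namespace llvm;

    // Count how many edges of the parent G_PHI carry MOUse's register, and
    // accept localization only when at most PhiThreshold remats would result.
    static bool phiRematIsCheapEnough(MachineOperand &MOUse) {
      auto *Phi = dyn_cast<GPhi>(MOUse.getParent());
      if (!Phi)
        return true; // Not a phi use; the phi-specific limit doesn't apply.
      unsigned NumUses = 0;
      for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
        NumUses += Phi->getIncomingValue(I) == MOUse.getReg();
      const unsigned PhiThreshold = 2; // Mirrors the FIXME-tagged constant.
      return NumUses <= PhiThreshold;
    }

Note that getNumPhiUses returning 0 for a non-phi user falls out the same way in the patch: 0 is never greater than the threshold, so ordinary cross-block uses stay eligible.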
@@ -164,19 +166,22 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
       if (!UseMI.isPHI())
         Users.insert(&UseMI);
     }
-    // If all the users were PHIs then they're not going to be in our block,
-    // don't try to move this instruction.
-    if (Users.empty())
-      continue;
-
     MachineBasicBlock::iterator II(MI);
-    ++II;
-    while (II != MBB.end() && !Users.count(&*II))
+    // If all the users were PHIs then they're not going to be in our block,
+    // but we may still benefit from sinking, especially since the value
+    // might be live across a call.
+    if (Users.empty()) {
+      // Make sure we don't sink in between two terminator sequences by
+      // scanning forward, not backward.
+      II = MBB.getFirstTerminatorForward();
+      LLVM_DEBUG(dbgs() << "Only phi users: moving inst to end: " << *MI);
+    } else {
       ++II;
-
-    assert(II != MBB.end() && "Didn't find the user in the MBB");
-    LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
-               << '\n');
+      while (II != MBB.end() && !Users.count(&*II))
+        ++II;
+      assert(II != MBB.end() && "Didn't find the user in the MBB");
+      LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II);
+    }
 
     MI->removeFromParent();
     MBB.insert(II, MI);
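The intra-block half is adjusted to match. Previously, an instruction whose users were all phis had no user in its own block, so it was simply left in place; now it is sunk to its block's terminator sequence, which shortens the live range when the value would otherwise stay live across a call or a long stretch of code. The forward scan matters because a block can end in several terminators in sequence, and the insertion point must not fall between two of them. A minimal restatement of the phi-only path (sinkBeforeTerminators is a name invented here):

    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;

    // Sink a def whose only users are phis in successor blocks down to the
    // start of its block's terminator sequence, found by scanning forward.
    static void sinkBeforeTerminators(MachineInstr *MI) {
      MachineBasicBlock &MBB = *MI->getParent();
      MachineBasicBlock::iterator II = MBB.getFirstTerminatorForward();
      MI->removeFromParent();
      MBB.insert(II, MI);
    }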
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll b/llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll
index 007e1fb3d63da..39ad002a0763f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll
@@ -12,10 +12,10 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
   ; CHECK: bb.1 (%ir-block.0):
   ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
-  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
-  ; CHECK-NEXT:   G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+  ; CHECK-NEXT:   G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
   ; CHECK-NEXT:   G_INVOKE_REGION_START
   ; CHECK-NEXT:   EH_LABEL
@@ -29,7 +29,7 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT:   liveins: $x0, $x1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
   ; CHECK-NEXT:   EH_LABEL
   ; CHECK-NEXT:   [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
   ; CHECK-NEXT:   G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
@@ -67,12 +67,12 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
   ; CHECK-NEXT:   liveins: $x0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
-  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
-  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
-  ; CHECK-NEXT:   G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
+  ; CHECK-NEXT:   G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.ptr)
   ; CHECK-NEXT:   [[ASSERT_ZEXT:%[0-9]+]]:_(s8) = G_ASSERT_ZEXT [[LOAD]], 1
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
   ; CHECK-NEXT:   [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[ASSERT_ZEXT]](s8)
   ; CHECK-NEXT:   G_INVOKE_REGION_START
   ; CHECK-NEXT:   EH_LABEL
@@ -86,7 +86,7 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
   ; CHECK-NEXT:   liveins: $x0, $x1
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
   ; CHECK-NEXT:   EH_LABEL
   ; CHECK-NEXT:   [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
   ; CHECK-NEXT:   G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll
index 5867326c18aa6..1602480ea3e0d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-hoisted-constants.ll
@@ -46,16 +46,16 @@ define i32 @test(i32 %a, i1 %c) {
   ; PRESELECTION-NEXT: {{  $}}
   ; PRESELECTION-NEXT:   [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
   ; PRESELECTION-NEXT:   [[COPY1:%[0-9]+]]:gpr(s32) = COPY $w1
-  ; PRESELECTION-NEXT:   [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
-  ; PRESELECTION-NEXT:   [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
-  ; PRESELECTION-NEXT:   [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C1]]
+  ; PRESELECTION-NEXT:   [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
+  ; PRESELECTION-NEXT:   [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+  ; PRESELECTION-NEXT:   [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
   ; PRESELECTION-NEXT:   [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
   ; PRESELECTION-NEXT:   [[AND:%[0-9]+]]:gpr(s32) = G_AND [[COPY1]], [[C2]]
   ; PRESELECTION-NEXT:   G_BRCOND [[AND]](s32), %bb.3
   ; PRESELECTION-NEXT:   G_BR %bb.2
   ; PRESELECTION-NEXT: {{  $}}
   ; PRESELECTION-NEXT: bb.2.common.ret:
-  ; PRESELECTION-NEXT:   [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C]](s32), %bb.1
+  ; PRESELECTION-NEXT:   [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C1]](s32), %bb.1
   ; PRESELECTION-NEXT:   $w0 = COPY [[PHI]](s32)
   ; PRESELECTION-NEXT:   RET_ReallyLR implicit $w0
   ; PRESELECTION-NEXT: {{  $}}
@@ -75,8 +75,8 @@ define i32 @test(i32 %a, i1 %c) {
   ; POSTSELECTION-NEXT: {{  $}}
   ; POSTSELECTION-NEXT:   [[COPY:%[0-9]+]]:gpr32 = COPY $w0
   ; POSTSELECTION-NEXT:   [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
-  ; POSTSELECTION-NEXT:   [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
   ; POSTSELECTION-NEXT:   [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 100000
+  ; POSTSELECTION-NEXT:   [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
   ; POSTSELECTION-NEXT:   TBNZW [[COPY1]], 0, %bb.3
   ; POSTSELECTION-NEXT:   B %bb.2
   ; POSTSELECTION-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir b/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
index 90580c847f290..942844e0d0444 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
@@ -56,7 +56,9 @@
 
   define void @test_inttoptr() { ret void }
   define void @many_local_use_intra_block() { ret void }
-  define void @non_local_phi_use_nonunique() { ret void }
+  define void @non_local_phi_single_use() { ret void }
+  define void @non_local_phi_three_uses() { ret void }
+
 
 ...
 ---
@@ -285,8 +287,8 @@ body:             |
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.1(0x80000000)
   ; CHECK:   [[PHI:%[0-9]+]]:fpr(s32) = PHI [[FADD]](s32), %bb.0, %4(s32), %bb.1
-  ; CHECK:   [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
   ; CHECK:   [[FADD1:%[0-9]+]]:fpr(s32) = G_FADD [[PHI]], [[FADD]]
+  ; CHECK:   [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
   ; CHECK:   G_BR %bb.1
 
   ; Existing registers should be left untouched
@@ -566,12 +568,12 @@ body:             |
 
 ...
 ---
-name:            non_local_phi_use_nonunique
+name:            non_local_phi_single_use
 legalized:       true
 regBankSelected: true
 tracksRegLiveness: true
 body:             |
-  ; CHECK-LABEL: name: non_local_phi_use_nonunique
+  ; CHECK-LABEL: name: non_local_phi_single_use
   ; CHECK: bb.0:
   ; CHECK:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; CHECK:   [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
@@ -582,27 +584,67 @@ body:             |
   ; CHECK:   G_BR %bb.2
   ; CHECK: bb.1:
   ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
   ; CHECK: bb.2:
-  ; CHECK:   [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
+  ; CHECK:   [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C2]](s32), %bb.1
   ; CHECK:   [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
 
-  ; Don't localize the 1 into bb.1, because there are multiple edges
-  ; using that register.
+  ; Localize the 1 into bb.1, since the number of phi uses is within the threshold.
 
   bb.0:
     successors: %bb.1, %bb.2
 
     %0:gpr(s32) = G_CONSTANT i32 1
     %1:gpr(s32) = G_ADD %0, %0
     %cmp:gpr(s32) = G_ICMP intpred(eq), %1(s32), %0
     %cond:gpr(s1) = G_TRUNC %cmp(s32)
     G_BRCOND %cond(s1), %bb.1
     G_BR %bb.2
 
   bb.1:
     successors: %bb.2
 
   bb.2:
     %3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0
     %2:gpr(s32) = G_ADD %3, %3
 ...
+---
+name:            non_local_phi_three_uses
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: non_local_phi_three_uses
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK:   [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
+  ; CHECK:   [[ADD:%[0-9]+]]:gpr(s32) = G_ADD [[C]], [[C]]
+  ; CHECK:   %cmp:gpr(s32) = G_ICMP intpred(eq), [[ADD]](s32), [[C]]
+  ; CHECK:   %cond:gpr(s1) = G_TRUNC %cmp(s32)
+  ; CHECK:   G_BRCOND %cond(s1), %bb.1
+  ; CHECK:   G_BR %bb.2
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK: bb.2:
+  ; CHECK:   [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
+  ; CHECK:   [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
+
+  ; Don't localize the 1 into bb.1; it's above the threshold of uses in the phi.
+
+  bb.0:
+    successors: %bb.1, %bb.2
+
+    %0:gpr(s32) = G_CONSTANT i32 1
+    %1:gpr(s32) = G_ADD %0, %0
+    %cmp:gpr(s32) = G_ICMP intpred(eq), %1(s32), %0
+    %cond:gpr(s1) = G_TRUNC %cmp(s32)
+    G_BRCOND %cond(s1), %bb.1
+    G_BR %bb.2
+
+  bb.1:
+    successors: %bb.2
+
+  bb.2:
+    %3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0, %0(s32), %bb.0, %0(s32), %bb.0
+    %2:gpr(s32) = G_ADD %3, %3
+...
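The two MIR tests bracket the threshold. In non_local_phi_single_use, the phi reads %0 on two edges, so getNumPhiUses returns 2, which is not above the threshold, and the constant is rematerialized into bb.1 (the new [[C2]]). In non_local_phi_three_uses, every incoming edge carries %0, the count exceeds the threshold, and the single def is kept. For reference, localizing a phi edge amounts to cloning the def at the bottom of the incoming block and rewiring that edge to the clone; a simplified per-edge sketch under the assumption that operand 0 of the def is its only result (localizePhiEdge is invented here; the real pass drives this from localizeInterBlock and reuses one clone per user block):

    #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    using namespace llvm;

    // Clone the instruction defining one phi edge's value into that edge's
    // predecessor block, then point the edge at the clone's new vreg.
    static void localizePhiEdge(GPhi &Phi, unsigned EdgeIdx,
                                MachineRegisterInfo &MRI) {
      Register OldReg = Phi.getIncomingValue(EdgeIdx);
      MachineBasicBlock *Pred = Phi.getIncomingBlock(EdgeIdx);
      MachineFunction &MF = *Pred->getParent();
      MachineInstr *Clone = MF.CloneMachineInstr(MRI.getVRegDef(OldReg));
      Register NewReg = MRI.cloneVirtualRegister(OldReg);
      Clone->getOperand(0).setReg(NewReg); // Assumes a single def at index 0.
      Pred->insert(Pred->getFirstTerminator(), Clone);
      Phi.getOperand(EdgeIdx * 2 + 1).setReg(NewReg);
    }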
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 4ac1fad6deecd..7a68aec1a1c55 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -230,32 +230,32 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX10-NEXT:    s_mov_b32 s2, 1
-; GFX10-NEXT:    v_mbcnt_hi_u32_b32 v2, -1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v2
+; GFX10-NEXT:    v_mbcnt_hi_u32_b32 v1, -1, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v1, v1, s[4:7], 0 offen
+; GFX10-NEXT:    buffer_load_dword v2, v2, s[4:7], 0 offen
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    ; implicit-def: $vgpr3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v2
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB4_4
 ; GFX10-NEXT:  ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT:    v_mov_b32_e32 v4, s12
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s12
+; GFX10-NEXT:    v_mov_b32_e32 v4, s12
 ; GFX10-NEXT:  .LBB4_2: ; %.preheader
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    buffer_load_dword v5, v4, s[4:7], 0 offen
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, 4, v4
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    buffer_load_dword v5, v3, s[4:7], 0 offen
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 4, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v5, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v5, v4
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB4_2
 ; GFX10-NEXT:  ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
 ; GFX10-NEXT:    s_or_b32 s2, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
 ; GFX10-NEXT:    s_mov_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 043e69abaeef2..a2c762d044b3f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -48,24 +48,24 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
 ; GCN-NEXT:    s_buffer_load_dword s2, s[4:7], 0x0
 ; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT:    s_mov_b32 s4, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s2, 56
-; GCN-NEXT:    s_cselect_b32 s2, 1, 0
+; GCN-NEXT:    s_cselect_b32 s4, 1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    s_not_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, 42
 ; GCN-NEXT:    s_not_b64 exec, exec
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
+; GCN-NEXT:    s_mov_b32 s2, 1
+; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 ; GCN-NEXT:    s_cbranch_scc0 .LBB2_2
 ; GCN-NEXT:  ; %bb.1: ; %.one
 ; GCN-NEXT:    v_add_u32_e32 v1, vcc, 1, v0
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s4, 0
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT:    s_mov_b32 s2, 0
 ; GCN-NEXT:  .LBB2_2: ; %Flow
-; GCN-NEXT:    s_xor_b32 s2, s4, 1
+; GCN-NEXT:    s_xor_b32 s2, s2, 1
 ; GCN-NEXT:    s_and_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s2, 0
 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index a4d5fe4ffa5a7..4d4da869d7507 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -36,11 +36,11 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
 ; LOOP-NEXT:    s_andn2_saveexec_b64 s[0:1], s[4:5]
 ; LOOP-NEXT:    s_cbranch_execz .LBB0_6
 ; LOOP-NEXT:  ; %bb.4: ; %copy_backwards
-; LOOP-NEXT:    s_mov_b32 s0, -4
 ; LOOP-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; LOOP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; LOOP-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; LOOP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; LOOP-NEXT:    s_mov_b32 s0, -4
 ; LOOP-NEXT:    s_mov_b32 s6, 0
 ; LOOP-NEXT:    s_mov_b32 s7, 0xf000
 ; LOOP-NEXT:    s_mov_b64 s[4:5], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index da9601a8998c2..36bac87889cac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -31,9 +31,9 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:  .LBB0_2: ; %Flow
 ; GFX9-NEXT:    s_xor_b32 s0, s0, 1
 ; GFX9-NEXT:    s_and_b32 s0, s0, 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 3add708d1a639..887c43f5fce59 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -186,12 +186,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
 define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-LABEL: s_udiv_i64:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_or_b64 s[4:5], s[0:1], s[2:3]
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, -1
+; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s4, 1
-; CHECK-NEXT:    s_or_b64 s[6:7], s[0:1], s[2:3]
-; CHECK-NEXT:    s_mov_b32 s8, 0
-; CHECK-NEXT:    s_mov_b32 s9, -1
-; CHECK-NEXT:    s_and_b64 s[6:7], s[6:7], s[8:9]
-; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[6:7], 0
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_2
 ; CHECK-NEXT:  ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 12df4b7c7fc33..5c6bb6dea1646 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -183,12 +183,12 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
 define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-LABEL: s_urem_i64:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_or_b64 s[4:5], s[0:1], s[2:3]
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, -1
+; CHECK-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[4:5], 0
 ; CHECK-NEXT:    s_mov_b32 s4, 1
-; CHECK-NEXT:    s_or_b64 s[6:7], s[0:1], s[2:3]
-; CHECK-NEXT:    s_mov_b32 s8, 0
-; CHECK-NEXT:    s_mov_b32 s9, -1
-; CHECK-NEXT:    s_and_b64 s[6:7], s[6:7], s[8:9]
-; CHECK-NEXT:    v_cmp_ne_u64_e64 vcc, s[6:7], 0
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_2
 ; CHECK-NEXT:  ; %bb.1: