[GlobalISel][Localizer] Allow localization of a small number of repeated phi uses. (#77566)

We previously had a heuristic that bailed out if a value V was used multiple times
in a single PHI, in order to avoid potentially rematerializing it into many
predecessors. Those phi uses only counted as a single use in the shouldLocalize()
hook, because it counted the PHI as one instruction user without factoring in that
the PHI may have many incoming edges.

It turns out this heuristic is slightly too pessimistic: allowing a small number
of these uses to be localized can improve code size by shortening live ranges,
especially if those ranges span a call.
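
For illustration, here is a minimal standalone sketch of the counting heuristic described above (not the LLVM implementation; `PhiEdge`, `countPhiUses`, and `shouldSkipLocalization` are made-up names): repeated incoming values of a phi are counted individually, and localization is only skipped once that count exceeds a small threshold.

```cpp
// Simplified model of the new phi-use heuristic: count how many incoming
// edges of a phi reuse the same value, and skip localization only when
// that count exceeds a small threshold (the old behavior bailed out on
// any repeat, i.e. count > 1).
#include <cstdint>
#include <iostream>
#include <vector>

struct PhiEdge {
  uint32_t IncomingReg; // virtual register feeding this edge
  uint32_t PredBlock;   // predecessor block id
};

// Count how many edges of the phi use Reg.
static unsigned countPhiUses(const std::vector<PhiEdge> &Phi, uint32_t Reg) {
  unsigned NumUses = 0;
  for (const PhiEdge &E : Phi)
    if (E.IncomingReg == Reg)
      ++NumUses;
  return NumUses;
}

static bool shouldSkipLocalization(const std::vector<PhiEdge> &Phi,
                                   uint32_t Reg) {
  const unsigned PhiThreshold = 2; // mirrors the FIXME'd constant in the patch
  return countPhiUses(Phi, Reg) > PhiThreshold;
}

int main() {
  // Register 7 used on two edges: still localized under the new heuristic.
  std::vector<PhiEdge> TwoUses = {{7, 1}, {7, 2}};
  // Register 7 used on three edges: above the threshold, so we still bail out.
  std::vector<PhiEdge> ThreeUses = {{7, 1}, {7, 2}, {7, 3}};
  std::cout << shouldSkipLocalization(TwoUses, 7) << ' '
            << shouldSkipLocalization(ThreeUses, 7) << '\n'; // prints "0 1"
  return 0;
}
```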

This change results in some improvements in size on CTMark -Os:
```
Program                                       size.__text
                                              before         after           diff
kimwitu++/kc                                  451676.00      451860.00       0.0%
mafft/pairlocalalign                          241460.00      241540.00       0.0%
tramp3d-v4/tramp3d-v4                         389216.00      389208.00      -0.0%
7zip/7zip-benchmark                           587528.00      587464.00      -0.0%
Bullet/bullet                                 457424.00      457348.00      -0.0%
consumer-typeset/consumer-typeset             405472.00      405376.00      -0.0%
SPASS/SPASS                                   410288.00      410120.00      -0.0%
lencod/lencod                                 426396.00      426108.00      -0.1%
ClamAV/clamscan                               380108.00      379756.00      -0.1%
sqlite3/sqlite3                               283664.00      283372.00      -0.1%
                           Geomean difference                               -0.0%
```
I experimented with different variations and thresholds. Using 3 instead of 2
resulted in a further 0.1% improvement on ClamAV, but regressed sqlite3 by the
same percentage.
aemerson committed Jan 11, 2024
1 parent e4e0b65 commit bbbe8ec
Showing 12 changed files with 142 additions and 78 deletions.
18 changes: 18 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -558,6 +558,24 @@ class GVecReduce : public GenericMachineInstr {
}
};

/// Represents a G_PHI.
class GPhi : public GenericMachineInstr {
public:
/// Returns the number of incoming values.
unsigned getNumIncomingValues() const { return (getNumOperands() - 1) / 2; }
/// Returns the I'th incoming vreg.
Register getIncomingValue(unsigned I) {
return getOperand(I * 2 + 1).getReg();
}
/// Returns the I'th incoming basic block.
MachineBasicBlock *getIncomingBlock(unsigned I) {
return getOperand(I * 2 + 2).getMBB();
}

static bool classof(const MachineInstr *MI) {
return MI->getOpcode() == TargetOpcode::G_PHI;
}
};

} // namespace llvm

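As an aside, the GPhi accessors added above assume the usual G_PHI operand layout: operand 0 is the result, followed by (value, basic-block) pairs. The following standalone sketch (`MiniPhi` is an illustrative stand-in, not an LLVM type) simply mirrors that index arithmetic to show how the accessors map onto the operand list.

```cpp
// Standalone model of the operand indexing used by GPhi:
//   operands = [def, val0, block0, val1, block1, ...]
//   num incoming = (NumOperands - 1) / 2, value at I*2+1, block at I*2+2.
#include <cassert>
#include <vector>

struct MiniPhi {
  std::vector<int> Operands; // flattened [def, val, block, val, block, ...]

  unsigned getNumIncomingValues() const {
    return static_cast<unsigned>((Operands.size() - 1) / 2);
  }
  int getIncomingValue(unsigned I) const { return Operands[I * 2 + 1]; }
  int getIncomingBlock(unsigned I) const { return Operands[I * 2 + 2]; }
};

int main() {
  // Models: %dst = G_PHI %7, %bb.1, %7, %bb.2 (same value on both edges).
  MiniPhi Phi{{/*def*/ 100, /*val*/ 7, /*bb*/ 1, /*val*/ 7, /*bb*/ 2}};
  assert(Phi.getNumIncomingValues() == 2);
  assert(Phi.getIncomingValue(0) == 7 && Phi.getIncomingBlock(1) == 2);
  return 0;
}
```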
7 changes: 3 additions & 4 deletions llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
@@ -67,10 +67,9 @@ class Localizer : public MachineFunctionPass {

typedef SmallSetVector<MachineInstr *, 32> LocalizedSetVecT;

/// If \p Op is a phi operand and not unique in that phi, that is,
/// there are other operands in the phi with the same register,
/// return true.
bool isNonUniquePhiValue(MachineOperand &Op) const;
/// If \p Op is a reg operand of a PHI, return the number of total
/// operands in the PHI that are the same as \p Op, including itself.
unsigned getNumPhiUses(MachineOperand &Op) const;

/// Do inter-block localization from the entry block.
bool localizeInterBlock(MachineFunction &MF,
55 changes: 30 additions & 25 deletions llvm/lib/CodeGen/GlobalISel/Localizer.cpp
Expand Up @@ -13,6 +13,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -58,18 +59,18 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
return InsertMBB == Def.getParent();
}

bool Localizer::isNonUniquePhiValue(MachineOperand &Op) const {
MachineInstr *MI = Op.getParent();
if (!MI->isPHI())
return false;
unsigned Localizer::getNumPhiUses(MachineOperand &Op) const {
auto *MI = dyn_cast<GPhi>(&*Op.getParent());
if (!MI)
return 0;

Register SrcReg = Op.getReg();
for (unsigned Idx = 1; Idx < MI->getNumOperands(); Idx += 2) {
auto &MO = MI->getOperand(Idx);
if (&MO != &Op && MO.isReg() && MO.getReg() == SrcReg)
return true;
unsigned NumUses = 0;
for (unsigned I = 0, NumVals = MI->getNumIncomingValues(); I < NumVals; ++I) {
if (MI->getIncomingValue(I) == SrcReg)
++NumUses;
}
return false;
return NumUses;
}

bool Localizer::localizeInterBlock(MachineFunction &MF,
@@ -108,11 +109,12 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
continue;
}

// If the use is a phi operand that's not unique, don't try to localize.
// If we do, we can cause unnecessary instruction bloat by duplicating
// into each predecessor block, when the existing one is sufficient and
// allows for easier optimization later.
if (isNonUniquePhiValue(MOUse))
// PHIs look like a single user but can use the same register in multiple
// edges, causing remat into each predecessor. Allow this to a certain
// extent.
unsigned NumPhiUses = getNumPhiUses(MOUse);
const unsigned PhiThreshold = 2; // FIXME: Tune this more.
if (NumPhiUses > PhiThreshold)
continue;

LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
@@ -164,19 +166,22 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
if (!UseMI.isPHI())
Users.insert(&UseMI);
}
// If all the users were PHIs then they're not going to be in our block,
// don't try to move this instruction.
if (Users.empty())
continue;

MachineBasicBlock::iterator II(MI);
++II;
while (II != MBB.end() && !Users.count(&*II))
// If all the users were PHIs then they're not going to be in our block, we
// may still benefit from sinking, especially since the value might be live
// across a call.
if (Users.empty()) {
// Make sure we don't sink in between two terminator sequences by scanning
// forward, not backward.
II = MBB.getFirstTerminatorForward();
LLVM_DEBUG(dbgs() << "Only phi users: moving inst to end: " << *MI);
} else {
++II;

assert(II != MBB.end() && "Didn't find the user in the MBB");
LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
<< '\n');
while (II != MBB.end() && !Users.count(&*II))
++II;
assert(II != MBB.end() && "Didn't find the user in the MBB");
LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II);
}

MI->removeFromParent();
MBB.insert(II, MI);
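The localizeIntraBlock change above also alters where an instruction whose only users are PHIs gets placed: instead of being skipped, it is now sunk to the block's first terminator, found by scanning forward so a terminator sequence is never split. A rough standalone model of that placement decision (`Inst` and `findInsertIndex` are illustrative, not LLVM types) might look like:

```cpp
// Standalone model of the revised intra-block placement choice.
#include <cstddef>
#include <iostream>
#include <vector>

struct Inst {
  bool IsUser = false;       // uses the value we want to sink
  bool IsTerminator = false; // part of the block's terminator sequence
};

// Returns the index the sunk instruction should be inserted before.
static std::size_t findInsertIndex(const std::vector<Inst> &Block,
                                   std::size_t DefIdx, bool OnlyPhiUsers) {
  if (OnlyPhiUsers) {
    // Scan forward for the first terminator so we never split a
    // terminator sequence (the role getFirstTerminatorForward plays above).
    for (std::size_t I = 0; I != Block.size(); ++I)
      if (Block[I].IsTerminator)
        return I;
    return Block.size();
  }
  // Otherwise, insert right before the first in-block user after the def.
  for (std::size_t I = DefIdx + 1; I != Block.size(); ++I)
    if (Block[I].IsUser)
      return I;
  return Block.size(); // not expected if a user exists, as asserted upstream
}

int main() {
  std::vector<Inst> Block(6);
  Block[5].IsTerminator = true; // e.g. a G_BR feeding a phi in a successor
  std::cout << findInsertIndex(Block, /*DefIdx=*/0, /*OnlyPhiUsers=*/true)
            << '\n'; // prints 5: the value is sunk just before the terminator
  return 0;
}
```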
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AArch64/GlobalISel/invoke-region.ll
@@ -12,10 +12,10 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
; CHECK: bb.1 (%ir-block.0):
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; CHECK-NEXT: G_INVOKE_REGION_START
; CHECK-NEXT: EH_LABEL <mcsymbol >
Expand All @@ -29,7 +29,7 @@ define i1 @test_lpad_phi_widen_into_pred() personality ptr @__gxx_personality_v0
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
@@ -67,12 +67,12 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
; CHECK-NEXT: liveins: $x0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C1]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: G_STORE [[C]](s32), [[GV]](p0) :: (store (s32) into @global_var)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.ptr)
; CHECK-NEXT: [[ASSERT_ZEXT:%[0-9]+]]:_(s8) = G_ASSERT_ZEXT [[LOAD]], 1
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[ASSERT_ZEXT]](s8)
; CHECK-NEXT: G_INVOKE_REGION_START
; CHECK-NEXT: EH_LABEL <mcsymbol >
Expand All @@ -86,7 +86,7 @@ define i1 @test_lpad_phi_widen_into_pred_ext(ptr %ptr) personality ptr @__gxx_pe
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.1
; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1
; CHECK-NEXT: EH_LABEL <mcsymbol >
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @global_var
; CHECK-NEXT: G_STORE [[PHI]](s32), [[GV1]](p0) :: (store (s32) into @global_var)
@@ -46,16 +46,16 @@ define i32 @test(i32 %a, i1 %c) {
; PRESELECTION-NEXT: {{ $}}
; PRESELECTION-NEXT: [[COPY:%[0-9]+]]:gpr(s32) = COPY $w0
; PRESELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr(s32) = COPY $w1
; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C1]]
; PRESELECTION-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 100000
; PRESELECTION-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:gpr(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
; PRESELECTION-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
; PRESELECTION-NEXT: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; PRESELECTION-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[COPY1]], [[C2]]
; PRESELECTION-NEXT: G_BRCOND [[AND]](s32), %bb.3
; PRESELECTION-NEXT: G_BR %bb.2
; PRESELECTION-NEXT: {{ $}}
; PRESELECTION-NEXT: bb.2.common.ret:
; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C]](s32), %bb.1
; PRESELECTION-NEXT: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI %7(s32), %bb.3, [[C1]](s32), %bb.1
; PRESELECTION-NEXT: $w0 = COPY [[PHI]](s32)
; PRESELECTION-NEXT: RET_ReallyLR implicit $w0
; PRESELECTION-NEXT: {{ $}}
Expand All @@ -75,8 +75,8 @@ define i32 @test(i32 %a, i1 %c) {
; POSTSELECTION-NEXT: {{ $}}
; POSTSELECTION-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; POSTSELECTION-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
; POSTSELECTION-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 100000
; POSTSELECTION-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $wzr
; POSTSELECTION-NEXT: TBNZW [[COPY1]], 0, %bb.3
; POSTSELECTION-NEXT: B %bb.2
; POSTSELECTION-NEXT: {{ $}}
56 changes: 49 additions & 7 deletions llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
@@ -56,7 +56,9 @@

define void @test_inttoptr() { ret void }
define void @many_local_use_intra_block() { ret void }
define void @non_local_phi_use_nonunique() { ret void }
define void @non_local_phi_single_use() { ret void }
define void @non_local_phi_three_uses() { ret void }

...

---
@@ -285,8 +287,8 @@ body: |
; CHECK: bb.1:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: [[PHI:%[0-9]+]]:fpr(s32) = PHI [[FADD]](s32), %bb.0, %4(s32), %bb.1
; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
; CHECK: [[FADD1:%[0-9]+]]:fpr(s32) = G_FADD [[PHI]], [[FADD]]
; CHECK: [[C1:%[0-9]+]]:fpr(s32) = G_FCONSTANT float 1.000000e+00
; CHECK: G_BR %bb.1
; Existing registers should be left untouched
@@ -566,12 +568,12 @@ body: |
...

---
name: non_local_phi_use_nonunique
name: non_local_phi_single_use
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: non_local_phi_use_nonunique
; CHECK-LABEL: name: non_local_phi_single_use
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
Expand All @@ -582,12 +584,12 @@ body: |
; CHECK: G_BR %bb.2
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; CHECK: bb.2:
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C2]](s32), %bb.1
; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
; Don't localize the 1 into bb.1, because there are multiple edges
; using that register.
; Localize the 1 into bb.1, since the number of uses is under the threshold.
bb.0:
successors: %bb.1, %bb.2
Expand All @@ -606,3 +608,43 @@ body: |
%3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0
%2:gpr(s32) = G_ADD %3, %3
...
---
name: non_local_phi_three_uses
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: non_local_phi_three_uses
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
; CHECK: [[ADD:%[0-9]+]]:gpr(s32) = G_ADD [[C]], [[C]]
; CHECK: %cmp:gpr(s32) = G_ICMP intpred(eq), [[ADD]](s32), [[C]]
; CHECK: %cond:gpr(s1) = G_TRUNC %cmp(s32)
; CHECK: G_BRCOND %cond(s1), %bb.1
; CHECK: G_BR %bb.2
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: bb.2:
; CHECK: [[PHI:%[0-9]+]]:gpr(s32) = G_PHI [[C]](s32), %bb.1
; CHECK: [[ADD1:%[0-9]+]]:gpr(s32) = G_ADD [[PHI]], [[PHI]]
; Don't localize the 1 into bb.1, since it is above the threshold of uses in the phi.
bb.0:
successors: %bb.1, %bb.2
%0:gpr(s32) = G_CONSTANT i32 1
%1:gpr(s32) = G_ADD %0, %0
%cmp:gpr(s32) = G_ICMP intpred(eq), %1(s32), %0
%cond:gpr(s1) = G_TRUNC %cmp(s32)
G_BRCOND %cond(s1), %bb.1
G_BR %bb.2
bb.1:
successors: %bb.2
bb.2:
%3:gpr(s32) = G_PHI %0(s32), %bb.1, %0(s32), %bb.0, %0(s32), %bb.0, %0(s32), %bb.0
%2:gpr(s32) = G_ADD %3, %3
...
@@ -230,32 +230,32 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s2, 1
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT: v_and_b32_e32 v3, 1, v2
; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GFX10-NEXT: ; implicit-def: $vgpr3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2
; GFX10-NEXT: s_cbranch_vccnz .LBB4_4
; GFX10-NEXT: ; %bb.1: ; %.preheader.preheader
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: v_mov_b32_e32 v3, s12
; GFX10-NEXT: v_mov_b32_e32 v4, s12
; GFX10-NEXT: .LBB4_2: ; %.preheader
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: buffer_load_dword v5, v4, s[4:7], 0 offen
; GFX10-NEXT: v_add_nc_u32_e32 v2, -1, v2
; GFX10-NEXT: v_add_nc_u32_e32 v4, 4, v4
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: buffer_load_dword v5, v3, s[4:7], 0 offen
; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v3, v5, v3
; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4
; GFX10-NEXT: s_cbranch_vccnz .LBB4_2
; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: s_mov_b32 s2, 0
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -48,24 +48,24 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s4, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 56
; GCN-NEXT: s_cselect_b32 s2, 1, 0
; GCN-NEXT: s_cselect_b32 s4, 1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_mov_b32 s2, 1
; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_cbranch_scc0 .LBB2_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: .LBB2_2: ; %Flow
; GCN-NEXT: s_xor_b32 s2, s4, 1
; GCN-NEXT: s_xor_b32 s2, s2, 1
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -36,11 +36,11 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[4:5]
; LOOP-NEXT: s_cbranch_execz .LBB0_6
; LOOP-NEXT: ; %bb.4: ; %copy_backwards
; LOOP-NEXT: s_mov_b32 s0, -4
; LOOP-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; LOOP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; LOOP-NEXT: v_add_i32_e32 v2, vcc, 3, v2
; LOOP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; LOOP-NEXT: s_mov_b32 s0, -4
; LOOP-NEXT: s_mov_b32 s6, 0
; LOOP-NEXT: s_mov_b32 s7, 0xf000
; LOOP-NEXT: s_mov_b64 s[4:5], 0
