Skip to content

Commit

Permalink
Bias physical register immediate assignments
Browse files Browse the repository at this point in the history
The machine scheduler currently biases register copies to/from
physical registers to be closer to their point of use / def to
minimize their live ranges. This change extends this to also physical
register assignments from immediate values.

This causes a reduction in reduction in overall register pressure and
minor reduction in spills and indirectly fixes an out-of-registers
assertion (PR39391).

Most test changes are from minor instruction reorderings and register
name selection changes and direct consequences of that.

Reviewers: MatzeB, qcolombet, myatsina, pcc

Subscribers: nemanjai, jvesely, nhaehnle, eraman, hiraditya,
  javed.absar, arphaman, jfb, jsji, llvm-commits

Differential Revision: https://reviews.llvm.org/D54218

llvm-svn: 346894
  • Loading branch information
niravhdave committed Nov 14, 2018
1 parent b048605 commit 1241dcb
Show file tree
Hide file tree
Showing 42 changed files with 443 additions and 338 deletions.
6 changes: 3 additions & 3 deletions llvm/include/llvm/CodeGen/MachineScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,7 @@ class GenericSchedulerBase : public MachineSchedStrategy {
/// Represent the type of SchedCandidate found within a single queue.
/// pickNodeBidirectional depends on these listed by decreasing priority.
enum CandReason : uint8_t {
NoCand, Only1, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak,
NoCand, Only1, PhysReg, RegExcess, RegCritical, Stall, Cluster, Weak,
RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};

Expand Down Expand Up @@ -928,7 +928,7 @@ bool tryPressure(const PressureChange &TryP,
const TargetRegisterInfo *TRI,
const MachineFunction &MF);
unsigned getWeakLeft(const SUnit *SU, bool isTop);
int biasPhysRegCopy(const SUnit *SU, bool isTop);
int biasPhysReg(const SUnit *SU, bool isTop);

/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
/// the schedule.
Expand Down Expand Up @@ -1006,7 +1006,7 @@ class GenericScheduler : public GenericSchedulerBase {
const RegPressureTracker &RPTracker,
SchedCandidate &Candidate);

void reschedulePhysRegCopies(SUnit *SU, bool isTop);
void reschedulePhysReg(SUnit *SU, bool isTop);
};

/// PostGenericScheduler - Interface to the scheduling algorithm used by
Expand Down
67 changes: 42 additions & 25 deletions llvm/lib/CodeGen/MachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2516,7 +2516,7 @@ const char *GenericSchedulerBase::getReasonStr(
switch (Reason) {
case NoCand: return "NOCAND ";
case Only1: return "ONLY1 ";
case PhysRegCopy: return "PREG-COPY ";
case PhysReg: return "PHYS-REG ";
case RegExcess: return "REG-EXCESS";
case RegCritical: return "REG-CRIT ";
case Stall: return "STALL ";
Expand Down Expand Up @@ -2852,24 +2852,41 @@ unsigned getWeakLeft(const SUnit *SU, bool isTop) {
/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
/// with the operation that produces or consumes the physreg. We'll do this when
/// regalloc has support for parallel copies.
int biasPhysRegCopy(const SUnit *SU, bool isTop) {
int biasPhysReg(const SUnit *SU, bool isTop) {
const MachineInstr *MI = SU->getInstr();
if (!MI->isCopy())
return 0;

unsigned ScheduledOper = isTop ? 1 : 0;
unsigned UnscheduledOper = isTop ? 0 : 1;
// If we have already scheduled the physreg produce/consumer, immediately
// schedule the copy.
if (TargetRegisterInfo::isPhysicalRegister(
MI->getOperand(ScheduledOper).getReg()))
return 1;
// If the physreg is at the boundary, defer it. Otherwise schedule it
// immediately to free the dependent. We can hoist the copy later.
bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
if (TargetRegisterInfo::isPhysicalRegister(
MI->getOperand(UnscheduledOper).getReg()))
return AtBoundary ? -1 : 1;
if (MI->isCopy()) {
unsigned ScheduledOper = isTop ? 1 : 0;
unsigned UnscheduledOper = isTop ? 0 : 1;
// If we have already scheduled the physreg produce/consumer, immediately
// schedule the copy.
if (TargetRegisterInfo::isPhysicalRegister(
MI->getOperand(ScheduledOper).getReg()))
return 1;
// If the physreg is at the boundary, defer it. Otherwise schedule it
// immediately to free the dependent. We can hoist the copy later.
bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
if (TargetRegisterInfo::isPhysicalRegister(
MI->getOperand(UnscheduledOper).getReg()))
return AtBoundary ? -1 : 1;
}

if (MI->isMoveImmediate()) {
// If we have a move immediate and all successors have been assigned, bias
// towards scheduling this later. Make sure all register defs are to
// physical registers.
bool DoBias = true;
for (const MachineOperand &Op : MI->defs()) {
if (Op.isReg() && !TargetRegisterInfo::isPhysicalRegister(Op.getReg())) {
DoBias = false;
break;
}
}

if (DoBias)
return isTop ? -1 : 1;
}

return 0;
}
} // end namespace llvm
Expand Down Expand Up @@ -2930,9 +2947,9 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
return;
}

if (tryGreater(biasPhysRegCopy(TryCand.SU, TryCand.AtTop),
biasPhysRegCopy(Cand.SU, Cand.AtTop),
TryCand, Cand, PhysRegCopy))
// Bias PhysReg Defs and copies to their uses and defined respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return;

// Avoid exceeding the target's limit.
Expand Down Expand Up @@ -3179,7 +3196,7 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
return SU;
}

void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
MachineBasicBlock::iterator InsertPos = SU->getInstr();
if (!isTop)
++InsertPos;
Expand All @@ -3194,7 +3211,7 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
continue;
MachineInstr *Copy = DepSU->getInstr();
if (!Copy->isCopy())
if (!Copy->isCopy() && !Copy->isMoveImmediate())
continue;
LLVM_DEBUG(dbgs() << " Rescheduling physreg copy ";
DAG->dumpNode(*Dep.getSUnit()));
Expand All @@ -3208,18 +3225,18 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
/// does.
///
/// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
/// them here. See comments in biasPhysRegCopy.
/// them here. See comments in biasPhysReg.
void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
Top.bumpNode(SU);
if (SU->hasPhysRegUses)
reschedulePhysRegCopies(SU, true);
reschedulePhysReg(SU, true);
} else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
Bot.bumpNode(SU);
if (SU->hasPhysRegDefs)
reschedulePhysRegCopies(SU, false);
reschedulePhysReg(SU, false);
}
}

Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86InstrCompiler.td
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
// These instructions XOR the frame pointer into a GPR. They are used in some
// stack protection schemes. These are post-RA pseudos because we only know the
// frame register after register allocation.
let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in {
let Constraints = "$src = $dst", isMoveImm = 1, isPseudo = 1, Defs = [EFLAGS] in {
def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
"xorl\t$$FP, $src", []>,
Requires<[NotLP64]>, Sched<[WriteALU]>;
Expand Down Expand Up @@ -270,7 +270,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>;
// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
isPseudo = 1, AddedComplexity = 10 in
isPseudo = 1, isMoveImm = 1, AddedComplexity = 10 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)]>, Sched<[WriteZero]>;

Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1493,7 +1493,7 @@ def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", []>;
}

let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
[(set GR8:$dst, imm:$src)]>;
Expand All @@ -1507,7 +1507,7 @@ def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, i64immSExt32:$src)]>;
}
let isReMaterializable = 1 in {
let isReMaterializable = 1, isMoveImm = 1 in {
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, relocImm:$src)]>;
Expand Down
60 changes: 28 additions & 32 deletions llvm/test/CodeGen/AMDGPU/call-argument-types.ll
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,11 @@ declare void @external_void_func_v16i8(<16 x i8>) #0

; MESA-DAG: s_mov_b64 s[0:1], s[36:37]

; GCN: v_mov_b32_e32 v0, 1{{$}}
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4
; GCN-DAG: v_mov_b32_e32 v0, 1{{$}}
; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4

; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
Expand Down Expand Up @@ -123,12 +123,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
; MESA-DAG: s_mov_b32 s33, s3{{$}}

; GCN: v_mov_b32_e32 v0, 0x7b
; HSA-DAG: s_mov_b32 s4, s33{{$}}
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
; GCN-DAG: v_mov_b32_e32 v0, 0x7b

; HSA-DAG: s_mov_b32 s4, s33{{$}}
; GCN-DAG: s_mov_b32 s32, s33{{$}}

; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
Expand All @@ -144,11 +144,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; MESA-DAG: s_mov_b32 s33, s3{{$}}

; GCN-DAG: buffer_load_sbyte v0
; GCN: s_mov_b32 s4, s33
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4

; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s3

; GCN: s_waitcnt vmcnt(0)
Expand All @@ -165,10 +165,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; HSA-DAG: s_mov_b32 s33, s9{{$}}

; GCN-DAG: buffer_load_ubyte v0
; GCN: s_mov_b32 s4, s33
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4

; GCN-DAG: s_mov_b32 s32, s33

Expand Down Expand Up @@ -197,10 +196,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; MESA-DAG: s_mov_b32 s33, s3{{$}}

; GCN-DAG: buffer_load_sshort v0
; GCN: s_mov_b32 s4, s33
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4

; GCN-DAG: s_mov_b32 s32, s33

Expand All @@ -217,11 +215,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; MESA-DAG: s_mov_b32 s33, s3{{$}}


; GCN-DAG: buffer_load_ushort v0
; GCN: s_mov_b32 s4, s33
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4

; GCN-DAG: s_mov_b32 s32, s33

Expand All @@ -237,11 +233,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
; MESA-DAG: s_mov_b32 s33, s3{{$}}

; GCN: v_mov_b32_e32 v0, 42
; GCN: s_mov_b32 s4, s33
; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
; GCN-DAG: v_mov_b32_e32 v0, 42
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33

; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
Expand Down Expand Up @@ -688,7 +684,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; GCN-NOT: s_add_u32 [[SP]]
; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
; GCN-NEXT: s_swappc_b64
; GCN: s_swappc_b64
; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
; GCN-NOT: s_sub_u32 [[SP]]
Expand Down
27 changes: 13 additions & 14 deletions llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -383,13 +383,12 @@ define void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
; GCN: enable_sgpr_workgroup_id_y = 0
; GCN: enable_sgpr_workgroup_id_z = 0

; GCN-NOT: s6
; GCN-DAG: s_mov_b32 s33, s7
; GCN-DAG: v_mov_b32_e32 v0, 0x22b

; GCN-NOT: s6
; GCN: s_mov_b32 s4, s33
; GCN-NOT: s6
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s33
; GCN-NOT: s6
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
call void @other_arg_use_workgroup_id_x(i32 555)
Expand Down Expand Up @@ -578,16 +577,16 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {

; GCN: s_swappc_b64

; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_X]]
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_X]]
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Y]]
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Y]]
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s[[LO_Z]]
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s[[HI_Z]]
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]]
; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]]
; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}}
; GCN-DAG: v_mov_b32_e32 v[[LO2:[0-9]+]], s[[LO_Y]]
; GCN-DAG: v_mov_b32_e32 v[[HI2:[0-9]+]], s[[HI_Y]]
; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO2]]:[[HI2]]{{\]}}
; GCN-DAG: v_mov_b32_e32 v[[LO3:[0-9]+]], s[[LO_Z]]
; GCN-DAG: v_mov_b32_e32 v[[HI3:[0-9]+]], s[[HI_Z]]
; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO3]]:[[HI3]]{{\]}}
; GCN: ; use
; GCN: ; use [[SAVE_X]]
; GCN: ; use [[SAVE_Y]]
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,13 @@ ret:
; GCN-DAG: s_movk_i32 s6, 0x204

; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
; CI: v_add_i32_e64 v0, s[6:7], s6, [[SCALED]]
; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], s6, [[SCALED]]

; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
; GFX9: v_add_u32_e32 v0, s6, [[SCALED]]
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], s6, [[SCALED]]

; GCN: v_mul_lo_i32 v0, v0, 9
; GCN: ds_write_b32 v0, v0
; GCN: v_mul_lo_i32 [[VZ]], [[VZ]], 9
; GCN: ds_write_b32 v0, [[VZ]]
define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
%alloca0 = alloca [128 x i32], align 4, addrspace(5)
%alloca1 = alloca [8 x i32], align 4, addrspace(5)
Expand All @@ -191,13 +191,13 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204

; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6
; CI: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]

; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]]
; GFX9: v_add_u32_e32 v0, [[OFFSET]], [[SCALED]]
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[OFFSET]], [[SCALED]]

; GCN: v_mul_lo_i32 v0, v0, 9
; GCN: ds_write_b32 v0, v0
; GCN: v_mul_lo_i32 [[VZ]], [[VZ]], 9
; GCN: ds_write_b32 v0, [[VZ]]
define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 {
%alloca0 = alloca [128 x i32], align 4, addrspace(5)
%alloca1 = alloca [8 x i32], align 4, addrspace(5)
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/local-atomics64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) noun
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN: ds_add_u64 {{v[0-9]+}}, v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
Expand Down
Loading

0 comments on commit 1241dcb

Please sign in to comment.