AMDGPU: Relax restriction on folding immediates into physregs
I never completed the work on the patches referenced by f8bf7d7, but this was
intended to avoid folding immediate writes into m0, which the coalescer doesn't
understand very well. Relax this to allow simple SGPR immediates to fold
directly into VGPR copies. This pattern shows up routinely in current
GlobalISel code, since nothing is smart enough to emit VGPR constants yet.
arsenm committed Jul 29, 2020
1 parent f05308a commit 766cb61
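
For motivation, here is a minimal MIR sketch of the pattern in question. It is
an illustration only, not part of the commit: the function name and the
constant 42 are invented, and the new fold-imm-copy.mir test below exercises
the same pattern with -8. GlobalISel materializes the constant in an SGPR and
then copies it into a VGPR:

# Hypothetical example: GlobalISel-style constant materialization feeding a VGPR copy.
---
name: sgpr_imm_into_vgpr_copy_example
tracksRegLiveness: true
body: |
  bb.0:
    %0:sreg_32 = S_MOV_B32 42
    $vgpr0 = COPY %0
    S_ENDPGM 0, implicit $vgpr0
...

With this change, SIFoldOperands can fold the immediate straight into the VGPR
def, i.e. $vgpr0 = V_MOV_B32_e32 42, implicit $exec (the fold_sgpr_imm_to_vgpr_copy
test added below checks exactly this for -8), while a copy of the same
immediate into $m0 is still left alone for the coalescer (the
no_fold_imm_into_m0 test).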
Showing 4 changed files with 64 additions and 50 deletions.
53 changes: 25 additions & 28 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -643,38 +643,35 @@ void SIFoldOperands::foldOperand(
 
   if (FoldingImmLike && UseMI->isCopy()) {
     Register DestReg = UseMI->getOperand(0).getReg();
+    Register SrcReg = UseMI->getOperand(1).getReg();
+    assert(SrcReg.isVirtual());
 
-    // Don't fold into a copy to a physical register. Doing so would interfere
-    // with the register coalescer's logic which would avoid redundant
-    // initalizations.
-    if (DestReg.isPhysical())
+    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
+
+    // Don't fold into a copy to a physical register with the same class. Doing
+    // so would interfere with the register coalescer's logic which would avoid
+    // redundant initalizations.
+    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
       return;
 
-    const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg);
-
-    Register SrcReg = UseMI->getOperand(1).getReg();
-    if (SrcReg.isVirtual()) { // XXX - This can be an assert?
-      const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
-      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
-        MachineRegisterInfo::use_iterator NextUse;
-        SmallVector<FoldCandidate, 4> CopyUses;
-        for (MachineRegisterInfo::use_iterator
-               Use = MRI->use_begin(DestReg), E = MRI->use_end();
-             Use != E; Use = NextUse) {
-          NextUse = std::next(Use);
-
-          // There's no point trying to fold into an implicit operand.
-          if (Use->isImplicit())
-            continue;
-
-          FoldCandidate FC = FoldCandidate(Use->getParent(),
-            Use.getOperandNo(), &UseMI->getOperand(1));
-          CopyUses.push_back(FC);
-        }
-        for (auto & F : CopyUses) {
-          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo,
-            FoldList, CopiesToReplace);
-        }
-      }
-    }
+    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
+    if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
+      MachineRegisterInfo::use_iterator NextUse;
+      SmallVector<FoldCandidate, 4> CopyUses;
+      for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(DestReg),
+                                             E = MRI->use_end();
+           Use != E; Use = NextUse) {
+        NextUse = std::next(Use);
+        // There's no point trying to fold into an implicit operand.
+        if (Use->isImplicit())
+          continue;
+
+        FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(),
+                                         &UseMI->getOperand(1));
+        CopyUses.push_back(FC);
+      }
+      for (auto &F : CopyUses) {
+        foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList, CopiesToReplace);
+      }
+    }
 
@@ -115,10 +115,8 @@ define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs_r
 }
 
 ; ALL-LABEL: {{^}}func_kernarg_segment_ptr:
-; ALL: s_mov_b32 [[S_LO:s[0-9]+]], 0{{$}}
-; ALL: s_mov_b32 [[S_HI:s[0-9]+]], 0{{$}}
-; ALL: v_mov_b32_e32 v0, [[S_LO]]{{$}}
-; ALL: v_mov_b32_e32 v1, [[S_HI]]{{$}}
+; ALL: v_mov_b32_e32 v0, 0{{$}}
+; ALL: v_mov_b32_e32 v1, 0{{$}}
 define i8 addrspace(4)* @func_kernarg_segment_ptr() {
   %ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
   ret i8 addrspace(4)* %ptr
27 changes: 9 additions & 18 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
@@ -139,19 +139,17 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i32_to_i96:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_mov_b32 s4, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -162,9 +160,8 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) {
 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b32 s4, 0
 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: v_mov_b32_e32 v2, 0
 ; GFX6-NEXT: s_waitcnt vmcnt(0)
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
   %load = load i32, i32 addrspace(1)* %ptr
@@ -177,23 +174,19 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_mov_b32 s5, 0
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i32_to_i128:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_mov_b32 s4, 0
-; GFX8-NEXT: s_mov_b32 s5, 0
 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -204,11 +197,9 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b32 s4, 0
-; GFX6-NEXT: s_mov_b32 s5, 0
 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
-; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
+; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, 0
 ; GFX6-NEXT: s_waitcnt vmcnt(0)
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
   %load = load i32, i32 addrspace(1)* %ptr
28 changes: 28 additions & 0 deletions llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -59,3 +59,31 @@ body: |
     S_ENDPGM 0, implicit %1, implicit %2
 ...
 
+# GCN-LABEL: name: no_fold_imm_into_m0{{$}}
+# GCN: %0:sreg_32 = S_MOV_B32 -8
+# GCN-NEXT: $m0 = COPY %0
+
+---
+name: no_fold_imm_into_m0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 -8
+    $m0 = COPY %0
+    S_ENDPGM 0, implicit $m0
+...
+
+# GCN-LABEL: name: fold_sgpr_imm_to_vgpr_copy{{$}}
+# GCN: $vgpr0 = V_MOV_B32_e32 -8, implicit $exec
+---
+name: fold_sgpr_imm_to_vgpr_copy
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 -8
+    $vgpr0 = COPY %0
+    S_ENDPGM 0, implicit $vgpr0
+...
