Skip to content

Commit

Permalink
[AMDGPU] Annotate vgpr<->agpr spills in asm
Browse files Browse the repository at this point in the history
Differential Revision: https://reviews.llvm.org/D92125
  • Loading branch information
rampitec committed Dec 7, 2020
1 parent bf8683a commit dd89249
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 53 deletions.
14 changes: 10 additions & 4 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Expand Up @@ -697,8 +697,10 @@ static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
: AMDGPU::V_ACCVGPR_READ_B32;

return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
.addReg(Src, getKillRegState(IsKill));
auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
.addReg(Src, getKillRegState(IsKill));
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
return MIB;
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
Expand Down Expand Up @@ -871,10 +873,12 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
RS->setRegUsed(TmpReg);
}
if (IsStore) {
auto AccRead = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
auto AccRead = BuildMI(*MBB, MI, DL,
TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
.addReg(SubReg, getKillRegState(IsKill));
if (NeedSuperRegDef)
AccRead.addReg(ValueReg, RegState::ImplicitDefine);
AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
}
SubReg = TmpReg;
}
Expand Down Expand Up @@ -908,10 +912,12 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
if (!IsAGPR && NeedSuperRegDef)
MIB.addReg(ValueReg, RegState::ImplicitDefine);

if (!IsStore && TmpReg != AMDGPU::NoRegister)
if (!IsStore && TmpReg != AMDGPU::NoRegister) {
MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
FinalReg)
.addReg(TmpReg, RegState::Kill);
MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
}
} else {
if (NeedSuperRegDef)
MIB.addReg(ValueReg, RegState::ImplicitDefine);
Expand Down
40 changes: 17 additions & 23 deletions llvm/test/CodeGen/AMDGPU/spill-agpr.ll
Expand Up @@ -5,10 +5,10 @@
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
bb:
Expand All @@ -34,10 +34,10 @@ bb:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
bb:
Expand All @@ -55,8 +55,7 @@ use:
st:
%gep1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 16
%gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 32
store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep1
store <4 x float> %mai.2, <4 x float> addrspace(1)* %gep2
call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
ret void
}

Expand All @@ -65,36 +64,31 @@ st:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC

; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse
; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0

; A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]]
; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse
define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
%v0 = load volatile i32, i32 addrspace(3)* undef
%v1 = load volatile i32, i32 addrspace(3)* undef
%v2 = load volatile i32, i32 addrspace(3)* undef
%v3 = load volatile i32, i32 addrspace(3)* undef
%v4 = load volatile i32, i32 addrspace(3)* undef
%v5 = load volatile i32, i32 addrspace(3)* undef
%v6 = load volatile i32, i32 addrspace(3)* undef
%v7 = load volatile i32, i32 addrspace(3)* undef
call void asm sideeffect "", "a,a,a,a,~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6}"(i32 %v0, i32 %v1, i32 %v2, i32 %v3)
%v8 = load volatile i32, i32 addrspace(3)* undef
call void asm sideeffect "", "a,a,a,a,a"(i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8)
%a1 = call <4 x i32> asm sideeffect "", "=a"()
%a2 = call <4 x i32> asm sideeffect "", "=a"()
%a3 = call i32 asm sideeffect "", "=a"()
%a4 = call <2 x i32> asm sideeffect "", "=a"()
call void asm sideeffect "", "a,a,a"(<4 x i32> %a1, <4 x i32> %a2, i32 %a3)
call void asm sideeffect "", "a"(<2 x i32> %a4)
ret void
}

; GCN-LABEL: {{^}}max_32regs_mfma32:
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; A2V-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse
; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse
; A2V: ScratchSize: 0
define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
bb:
Expand All @@ -115,6 +109,6 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i3
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)

attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
attributes #1 = { nounwind "amdgpu-num-vgpr"="10" }
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }
52 changes: 26 additions & 26 deletions llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
Expand Up @@ -5,15 +5,15 @@
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse
; GFX900: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GFX908-NOT: buffer_
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse

; GCN: NumVgprs: 10
; GFX900: ScratchSize: 12
Expand Down Expand Up @@ -59,10 +59,10 @@ define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 {
; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX908-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
; GFX908: buffer_store_dword v{{[0-9]}},
; GFX908-NOT: buffer_
; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9
; GFX908: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse
; GFX908: buffer_load_dword v{{[0-9]}},
; GFX908-NOT: buffer_

Expand Down Expand Up @@ -113,28 +113,28 @@ define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_write_b32 a0, 1
; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
; GFX900: buffer_store_dword v{{[0-9]}},
; GCN-DAG: buffer_store_dword v{{[0-9]}},
; GFX900: buffer_load_dword v{{[0-9]}},
; GCN-DAG: buffer_load_dword v{{[0-9]}},
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8 ; Reload Reuse
; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9 ; Reload Reuse

; GCN: NumVgprs: 10
; GFX900: ScratchSize: 44
Expand Down Expand Up @@ -166,8 +166,8 @@ define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)*
; GCN-LABEL: {{^}}max_10_vgprs_spill_v32:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-DAG: v_accvgpr_write_b32 a0,
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}} ; Reload Reuse
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}} ; Reload Reuse
; GCN-NOT: a10
; GCN: buffer_store_dword v{{[0-9]}},

Expand Down

0 comments on commit dd89249

Please sign in to comment.