Skip to content

Commit

Permalink
AMDGPU/GlobalISel: Run the localizer pass
Browse files Browse the repository at this point in the history
While looking at the output on real sized programs, there is a lot of
extra SGPR spilling compared to the DAG path. This seems to largely be
from all constants being SGPRs in the entry block.
  • Loading branch information
arsenm committed Feb 17, 2020
1 parent 2178088 commit 5fdc985
Show file tree
Hide file tree
Showing 2 changed files with 214 additions and 0 deletions.
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Expand Up @@ -30,6 +30,7 @@
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
Expand Down Expand Up @@ -623,6 +624,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
void addPreLegalizeMachineIR() override;
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
Expand Down Expand Up @@ -914,6 +916,12 @@ bool GCNPassConfig::addRegBankSelect() {
return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  // Schedule the GlobalISel Localizer in the hook that runs just before
  // instruction selection.
  //
  // FIXME: Ideally this would run before globals are legalized, but for some
  // reason the pass currently requires its input to already be legalized and
  // regbankselected.
  auto *LocalizerPass = new Localizer();
  addPass(LocalizerPass);
}

bool GCNPassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect());
return false;
Expand Down
206 changes: 206 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -0,0 +1,206 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s

; Test the localizer did something and we don't materialize all
; constants in SGPRs in the entry block.

; Kernel whose entry block only branches on %cond. Both successors (bb0 and
; bb1) perform volatile stores of the same six i32 constants, in different
; orders, and then fall into bb2, which just returns. The expected output
; materializes each constant with a v_mov_b32 inside the block that stores
; it, directly ahead of the corresponding global_store_dword, rather than
; ahead of the branch in the entry block.
define amdgpu_kernel void @localize_constants(i1 %cond) {
; GFX9-LABEL: localize_constants:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc0 BB0_2
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: BB0_2: ; %bb1
; GFX9-NEXT: v_mov_b32_e32 v0, 0x5be6
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c7
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e8
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1c8
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3e7
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_endpgm
entry:
br i1 %cond, label %bb0, label %bb1

bb0:
store volatile i32 123, i32 addrspace(1)* undef
store volatile i32 456, i32 addrspace(1)* undef
store volatile i32 999, i32 addrspace(1)* undef
store volatile i32 1000, i32 addrspace(1)* undef
store volatile i32 455, i32 addrspace(1)* undef
store volatile i32 23526, i32 addrspace(1)* undef
br label %bb2

bb1:
store volatile i32 23526, i32 addrspace(1)* undef
store volatile i32 455, i32 addrspace(1)* undef
store volatile i32 1000, i32 addrspace(1)* undef
store volatile i32 456, i32 addrspace(1)* undef
store volatile i32 999, i32 addrspace(1)* undef
store volatile i32 123, i32 addrspace(1)* undef
br label %bb2

bb2:
ret void
}

; FIXME: These aren't localized because they were legalized before
; the localizer ran, and are no longer G_GLOBAL_VALUE.
@gv0 = addrspace(1) global i32 undef, align 4
@gv1 = addrspace(1) global i32 undef, align 4
@gv2 = addrspace(1) global i32 undef, align 4
@gv3 = addrspace(1) global i32 undef, align 4

; Kernel whose entry block only branches on %cond. bb0 stores to @gv0 and
; @gv1; bb1 stores to @gv2 and @gv3; each global is used in exactly one
; successor. In the expected output all four s_getpc_b64 / s_add_u32 /
; s_addc_u32 address computations (using @gotpcrel32 relocations) are still
; emitted in the entry block ahead of the branch; only the s_load_dwordx2
; from each computed address appears in the block that uses it.
define amdgpu_kernel void @localize_globals(i1 %cond) {
; GFX9-LABEL: localize_globals:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, gv2@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, gv2@gotpcrel32@hi+4
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, gv3@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, gv3@gotpcrel32@hi+4
; GFX9-NEXT: s_getpc_b64 s[8:9]
; GFX9-NEXT: s_add_u32 s8, s8, gv0@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s9, s9, gv0@gotpcrel32@hi+4
; GFX9-NEXT: s_getpc_b64 s[6:7]
; GFX9-NEXT: s_add_u32 s6, s6, gv1@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s7, s7, gv1@gotpcrel32@hi+4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, s4, 1
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-NEXT: s_cbranch_scc0 BB1_2
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v5, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_branch BB1_3
; GFX9-NEXT: BB1_2: ; %bb1
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v5, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: BB1_3: ; %bb2
; GFX9-NEXT: global_store_dword v[0:1], v4, off
; GFX9-NEXT: global_store_dword v[2:3], v5, off
; GFX9-NEXT: s_endpgm
entry:
br i1 %cond, label %bb0, label %bb1

bb0:
store volatile i32 0, i32 addrspace(1)* @gv0
store volatile i32 1, i32 addrspace(1)* @gv1
br label %bb2

bb1:
store volatile i32 0, i32 addrspace(1)* @gv2
store volatile i32 1, i32 addrspace(1)* @gv3
br label %bb2

bb2:
ret void
}

@static.gv0 = internal addrspace(1) global i32 undef, align 4
@static.gv1 = internal addrspace(1) global i32 undef, align 4
@static.gv2 = internal addrspace(1) global i32 undef, align 4
@static.gv3 = internal addrspace(1) global i32 undef, align 4

; Non-kernel function (plain `define void`, returns via s_setpc_b64) whose
; entry block only branches on %cond. bb0 stores to @static.gv0 and
; @static.gv1; bb1 stores to @static.gv2 and @static.gv3; all four globals
; have internal linkage and are addressed with @rel32 relocations. In the
; expected output the four s_getpc_b64 address computations are all emitted
; in the entry block, ahead of the exec-mask branch sequence, and the
; resulting SGPR pairs are only copied to VGPRs in the block that stores
; through them.
define void @localize_internal_globals(i1 %cond) {
; GFX9-LABEL: localize_internal_globals:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_getpc_b64 s[10:11]
; GFX9-NEXT: s_add_u32 s10, s10, static.gv2@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s11, s11, static.gv2@rel32@hi+4
; GFX9-NEXT: s_getpc_b64 s[8:9]
; GFX9-NEXT: s_add_u32 s8, s8, static.gv3@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s9, s9, static.gv3@rel32@hi+4
; GFX9-NEXT: s_getpc_b64 s[6:7]
; GFX9-NEXT: s_add_u32 s6, s6, static.gv0@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s7, s7, static.gv0@rel32@hi+4
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, static.gv1@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, static.gv1@rel32@hi+4
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, 1
; GFX9-NEXT: s_xor_b64 s[12:13], vcc, s[12:13]
; GFX9-NEXT: s_and_saveexec_b64 s[14:15], s[12:13]
; GFX9-NEXT: s_xor_b64 s[12:13], exec, s[14:15]
; GFX9-NEXT: s_cbranch_execnz BB2_2
; GFX9-NEXT: ; %bb.1: ; %bb1
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s11
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: BB2_2: ; %Flow
; GFX9-NEXT: s_or_saveexec_b64 s[8:9], s[12:13]
; GFX9-NEXT: s_xor_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_cbranch_execz BB2_4
; GFX9-NEXT: ; %bb.3: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: BB2_4: ; %bb2
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
entry:
br i1 %cond, label %bb0, label %bb1

bb0:
store volatile i32 0, i32 addrspace(1)* @static.gv0
store volatile i32 1, i32 addrspace(1)* @static.gv1
br label %bb2

bb1:
store volatile i32 0, i32 addrspace(1)* @static.gv2
store volatile i32 1, i32 addrspace(1)* @static.gv3
br label %bb2

bb2:
ret void
}

0 comments on commit 5fdc985

Please sign in to comment.