Skip to content

Commit

Permalink
[AMDGPU] Reserve extra SGPR blocks with XNACK "any" TID Setting
Browse files Browse the repository at this point in the history
ASMPrinter was relying on feature bits to set up extra SGPRs in the kernel
descriptor for the xnack_mask. This was broken for the dynamic XNACK "any" TID
setting which could cause user SGPRs to be clobbered if the number of SGPRs
reserved was near a granulated block boundary.

When XNACK was enabled this worked correctly in the ASMParser which meant some
kernels were only failing without "-save-temps".

Fixes: SWDEV-382764

Reviewed By: kzhuravl

Differential Revision: https://reviews.llvm.org/D145401
  • Loading branch information
kerbowa committed Mar 18, 2023
1 parent b5c862e commit 864a2b2
Show file tree
Hide file tree
Showing 10 changed files with 318 additions and 32 deletions.
2 changes: 1 addition & 1 deletion clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
Expand Up @@ -2,7 +2,7 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null

// expected-remark@+9 {{Function Name: foo}}
// expected-remark@+8 {{ SGPRs: 9}}
// expected-remark@+8 {{ SGPRs: 13}}
// expected-remark@+7 {{ VGPRs: 10}}
// expected-remark@+6 {{ AGPRs: 12}}
// expected-remark@+5 {{ ScratchSize [bytes/lane]: 0}}
Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Expand Up @@ -251,9 +251,9 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
CurrentProgramInfo.NumVGPRsForWavesPerEU,
CurrentProgramInfo.NumSGPRsForWavesPerEU -
IsaInfo::getNumExtraSGPRs(&STM,
CurrentProgramInfo.VCCUsed,
CurrentProgramInfo.FlatUsed),
IsaInfo::getNumExtraSGPRs(
&STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
CodeObjectVersion);

Expand Down Expand Up @@ -721,7 +721,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
&STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);
&STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
getTargetStreamer()->getTargetID()->isXnackOnOrAny());

// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
Expand Up @@ -3061,7 +3061,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 12
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 9
; GPRIDX-NEXT: wavefront_sgpr_count = 13
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
Expand Down Expand Up @@ -3913,7 +3913,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 0
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
Expand Down Expand Up @@ -3956,7 +3956,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 12
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 6
; GPRIDX-NEXT: wavefront_sgpr_count = 10
; GPRIDX-NEXT: workitem_vgpr_count = 2
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
Expand Down Expand Up @@ -4259,7 +4259,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 0
; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
Expand Down Expand Up @@ -4302,7 +4302,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 12
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
; GPRIDX-NEXT: wavefront_sgpr_count = 7
; GPRIDX-NEXT: wavefront_sgpr_count = 11
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s
; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdpal -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX8 -enable-var-scope %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG,GFX9 -enable-var-scope %s
; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mattr=-xnack -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL,GFX9 -enable-var-scope %s

declare amdgpu_gfx float @extern_func(float) #0
declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1010,GFX1010W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1010,GFX1010W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack < %s | FileCheck --check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1010,GFX1010W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-xnack -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1010,GFX1010W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W32,GFX1030,GFX1030W32 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
Expand Up @@ -2,7 +2,7 @@
; RUN: FileCheck -check-prefix=REMARK %s < %t

; STDERR: remark: foo.cl:27:0: Function Name: test_kernel
; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 24
; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 28
; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: 9
; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: 43
; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: 0
Expand All @@ -27,7 +27,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' SGPRs: '
; REMARK-NEXT: - NumSGPR: '24'
; REMARK-NEXT: - NumSGPR: '28'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
Expand Down Expand Up @@ -120,7 +120,7 @@ define void @test_func() !dbg !6 {
}

; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 4
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
Expand Down
27 changes: 27 additions & 0 deletions llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -0,0 +1,27 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefixes=ASM %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s

; TODO: Update to check for granulated sgpr count directive once one is added.

; Minimal kernel whose body is a single empty inline asm that clobbers s[0:4].
; The checks expect next_free_sgpr to be 5 and the xnack mask to be reserved.
; NOTE(review): the RUN lines pass no xnack -mattr, which the commit describes
; as the XNACK "any" target-ID setting -- confirm against the target-ID docs.
define amdgpu_kernel void @kern() {
; ASM-LABEL: kern:
; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_reserve_xnack_mask 1

; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
; NOTE(review): the dword at offset 0x30 (bytes 40 00 af 00) is presumably
; compute_pgm_rsrc1 with a non-zero granulated SGPR count field -- confirm
; against the kernel descriptor layout.
; OBJ: Contents of section .rodata:
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............

; The metadata SGPR count is expected to be 9, above the next_free_sgpr of 5.
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
entry:
; Empty asm string; the constraint only marks s0 through s4 as clobbered.
tail call void asm sideeffect "", "~{s[0:4]}"()
ret void
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
27 changes: 27 additions & 0 deletions llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -0,0 +1,27 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack < %s | FileCheck --check-prefixes=ASM %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s

; TODO: Update to check for granulated sgpr count directive once one is added.

; Minimal kernel whose body is a single empty inline asm that clobbers s[0:4].
; The RUN lines pass -mattr=-xnack; the checks expect next_free_sgpr to be 5
; and no xnack mask reservation.
define amdgpu_kernel void @kern() {
; ASM-LABEL: kern:
; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_reserve_xnack_mask 0

; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
; NOTE(review): the dword at offset 0x30 (bytes 00 00 af 00) is presumably
; compute_pgm_rsrc1 with a zero granulated SGPR count field -- confirm against
; the kernel descriptor layout.
; OBJ: Contents of section .rodata:
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................

; The metadata SGPR count is expected to equal the next_free_sgpr value of 5.
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 5
entry:
; Empty asm string; the constraint only marks s0 through s4 as clobbered.
tail call void asm sideeffect "", "~{s[0:4]}"()
ret void
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
27 changes: 27 additions & 0 deletions llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -0,0 +1,27 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefixes=ASM %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s

; TODO: Update to check for granulated sgpr count directive once one is added.

; Minimal kernel whose body is a single empty inline asm that clobbers s[0:4].
; The RUN lines pass -mattr=+xnack; the checks expect next_free_sgpr to be 5
; and the xnack mask to be reserved.
define amdgpu_kernel void @kern() {
; ASM-LABEL: kern:
; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_reserve_xnack_mask 1

; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
; NOTE(review): the dword at offset 0x30 (bytes 40 00 af 00) is presumably
; compute_pgm_rsrc1 with a non-zero granulated SGPR count field -- confirm
; against the kernel descriptor layout.
; OBJ: Contents of section .rodata:
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............

; The metadata SGPR count is expected to be 9, above the next_free_sgpr of 5.
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
entry:
; Empty asm string; the constraint only marks s0 through s4 as clobbered.
tail call void asm sideeffect "", "~{s[0:4]}"()
ret void
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}

0 comments on commit 864a2b2

Please sign in to comment.