Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 22 additions & 16 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,23 +190,29 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
// sizes limits the achievable maximum, and we aim to support enough waves per
// EU so that we can concurrently execute all waves of a single workgroup of
// maximum size on a CU.
std::pair<unsigned, unsigned> Default = {
std::pair<unsigned, unsigned> WavesPerEU = {
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
Default.first = std::min(Default.first, Default.second);

// Make sure requested minimum is within the default range and lower than the
// requested maximum. The latter must not violate target specification.
if (RequestedWavesPerEU.first < Default.first ||
RequestedWavesPerEU.first > Default.second ||
RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
RequestedWavesPerEU.second > getMaxWavesPerEU())
return Default;

// We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
RequestedWavesPerEU.second =
std::min(RequestedWavesPerEU.second, Default.second);
return RequestedWavesPerEU;
WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);

// Requested minimum must not violate subtarget's specifications and be no
// greater than maximum.
if (RequestedWavesPerEU.first &&
(RequestedWavesPerEU.first < getMinWavesPerEU() ||
RequestedWavesPerEU.first > RequestedWavesPerEU.second))
return WavesPerEU;
// Requested maximum must not violate subtarget's specifications.
if (RequestedWavesPerEU.second > getMaxWavesPerEU())
return WavesPerEU;

// A requested maximum may limit both the final minimum and maximum, but
// not increase them. A requested minimum can either decrease or increase the
// default minimum.
WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second);
if (RequestedWavesPerEU.first)
WavesPerEU.first = RequestedWavesPerEU.first;
WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
return WavesPerEU;
}

std::pair<unsigned, unsigned>
Expand All @@ -225,7 +231,7 @@ std::pair<unsigned, unsigned>
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
unsigned LDSBytes, const Function &F) const {
// Default minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU());

// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested =
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,9 @@ class AMDGPUSubtarget {
/// Returns the target minimum/maximum number of waves per EU. This is based
/// on the minimum/maximum number of \p RequestedWavesPerEU and further
/// limited by the maximum achievable occupancy derived from the range of \p
/// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
/// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A minimum
/// requested waves/EU value of 0 indicates an intent to not restrict the
/// minimum target occupancy.
std::pair<unsigned, unsigned>
getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
std::pair<unsigned, unsigned> FlatWorkGroupSizes,
Expand Down
12 changes: 12 additions & 0 deletions llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,15 @@ entry:
ret void
}
attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"}

; At most 2 waves per execution unit.
; CHECK-LABEL: {{^}}empty_at_most_2:
; CHECK: SGPRBlocks: 12
; CHECK: VGPRBlocks: 21
; CHECK: NumSGPRsForWavesPerEU: 102
; CHECK: NumVGPRsForWavesPerEU: 85
define amdgpu_kernel void @empty_at_most_2() #13 {
entry:
ret void
}
attributes #13 = {"amdgpu-waves-per-eu"="0,2"}

This file was deleted.

25 changes: 13 additions & 12 deletions llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s

; Check propagation of amdgpu-flat-work-group-size attribute.
; Check propagation of amdgpu-waves-per-eu attribute.

; Called from a single kernel with 1,8
define internal void @default_to_1_8_a() {
Expand Down Expand Up @@ -216,41 +216,42 @@ define internal i32 @bitcasted_function() {
ret i32 0
}

define internal void @called_from_invalid_bounds_0() {
; CHECK-LABEL: define internal void @called_from_invalid_bounds_0
define internal void @called_without_min_waves() {
; CHECK-LABEL: define internal void @called_without_min_waves
; CHECK-SAME: () #[[ATTR1]] {
; CHECK-NEXT: ret void
;
ret void
}

define internal void @called_from_invalid_bounds_1() {
; CHECK-LABEL: define internal void @called_from_invalid_bounds_1
define internal void @called_from_invalid_bounds() {
; CHECK-LABEL: define internal void @called_from_invalid_bounds
; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
; CHECK-NEXT: ret void
;
ret void
}

; Invalid range for amdgpu-waves-per-eu
define amdgpu_kernel void @kernel_invalid_bounds_0_8() #9 {
; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_0_8
; Kernel does not specify a minimum desired occupancy so will end up with the
; default minimum (and the requested maximum).
define amdgpu_kernel void @kernel_without_min_waves() #9 {
; CHECK-LABEL: define amdgpu_kernel void @kernel_without_min_waves
; CHECK-SAME: () #[[ATTR1]] {
; CHECK-NEXT: call void @called_from_invalid_bounds_0()
; CHECK-NEXT: call void @called_without_min_waves()
; CHECK-NEXT: ret void
;
call void @called_from_invalid_bounds_0()
call void @called_without_min_waves()
ret void
}

; Invalid range for amdgpu-waves-per-eu
define amdgpu_kernel void @kernel_invalid_bounds_1_123() #10 {
; CHECK-LABEL: define amdgpu_kernel void @kernel_invalid_bounds_1_123
; CHECK-SAME: () #[[ATTR11:[0-9]+]] {
; CHECK-NEXT: call void @called_from_invalid_bounds_1()
; CHECK-NEXT: call void @called_from_invalid_bounds()
; CHECK-NEXT: ret void
;
call void @called_from_invalid_bounds_1()
call void @called_from_invalid_bounds()
ret void
}

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/vgpr-limit-gfx1250.ll
Original file line number Diff line number Diff line change
Expand Up @@ -575,5 +575,5 @@ define amdgpu_kernel void @use1024vgprs_codegen(ptr %p) #1281 {
ret void
}

attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="1" }
attributes #1281 = { nounwind "amdgpu-flat-work-group-size"="128,128" "amdgpu-waves-per-eu"="1" }
attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
attributes #1281 = { nounwind "amdgpu-flat-work-group-size"="128,128" }
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s

; All kernels have the same value for amdgpu-flat-work-group-size, except the
; second and third kernels explicitly set it. The first and second kernel also
; have the same final waves/EU range, except the second kernel explicitly sets
; it with amdgpu-waves-per-eu. As a result the first and second kernels are
; treated identically.
;
; The third kernel hints the compiler that a maximum occupancy of 1 is desired
; with amdgpu-waves-per-eu, so the alloca promotion pass is free to use more LDS
; space than when limiting itself to support the maximum default occupancy of
; 10. This does not break the ABI requirement to support the full possible range
; of workgroup sizes as specified by amdgpu-flat-work-group-size.

; CHECK-NOT: @no_attributes.stack
; CHECK-NOT: @explicit_default_workgroup_size_and_waves.stack

; CHECK-LABEL: @no_attributes(
; CHECK: alloca [5 x i32]
; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
define amdgpu_kernel void @no_attributes(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
%0 = load i32, ptr addrspace(1) %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
store i32 4, ptr addrspace(5) %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
%1 = load i32, ptr addrspace(1) %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
store i32 5, ptr addrspace(5) %arrayidx3, align 4
%2 = load i32, ptr addrspace(5) %stack, align 4
store i32 %2, ptr addrspace(1) %out, align 4
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
%3 = load i32, ptr addrspace(5) %arrayidx12
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
store i32 %3, ptr addrspace(1) %arrayidx13
ret void
}

; CHECK-LABEL: @explicit_default_workgroup_size_and_waves(
; CHECK: alloca [5 x i32]
; CHECK: store i32 4, ptr addrspace(5) %arrayidx1, align 4
define amdgpu_kernel void @explicit_default_workgroup_size_and_waves(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
%0 = load i32, ptr addrspace(1) %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
store i32 4, ptr addrspace(5) %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
%1 = load i32, ptr addrspace(1) %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
store i32 5, ptr addrspace(5) %arrayidx3, align 4
%2 = load i32, ptr addrspace(5) %stack, align 4
store i32 %2, ptr addrspace(1) %out, align 4
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
%3 = load i32, ptr addrspace(5) %arrayidx12
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
store i32 %3, ptr addrspace(1) %arrayidx13
ret void
}

; CHECK-LABEL: @explicit_low_occupancy_requested(
; CHECK-NOT: alloca [5 x i32]
define amdgpu_kernel void @explicit_low_occupancy_requested(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
%0 = load i32, ptr addrspace(1) %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
store i32 4, ptr addrspace(5) %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
%1 = load i32, ptr addrspace(1) %arrayidx2, align 4
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
store i32 5, ptr addrspace(5) %arrayidx3, align 4
%2 = load i32, ptr addrspace(5) %stack, align 4
store i32 %2, ptr addrspace(1) %out, align 4
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
%3 = load i32, ptr addrspace(5) %arrayidx12
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
store i32 %3, ptr addrspace(1) %arrayidx13
ret void
}

attributes #0 = { "amdgpu-waves-per-eu"="4,10" "amdgpu-flat-work-group-size"="1,1024" }
attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
Loading