Skip to content

Commit

Permalink
AMDGPU: Fix computation for getOccupancyWithLocalMemSize
Browse files Browse the repository at this point in the history
The computation here didn't really make sense to me, and reported
wildly different results depending on the flat work group size
attribute.

I think this should really report a range derived from the possible
work group size bounds, and only allow an occupancy that is a multiple
of the group size.
  • Loading branch information
arsenm committed Mar 3, 2020
1 parent 27a3ece commit 88aced1
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 9 deletions.
41 changes: 32 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Expand Up @@ -328,18 +328,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
// Returns an occupancy figure (a wave count) for function F when it uses
// `Bytes` bytes of local memory (LDS). In the post-change version the result
// is 0 when getMaxWorkGroupsPerCU() reports no work groups for F's maximum
// flat work group size, 1 when `Bytes` is larger than getLocalMemorySize(),
// and otherwise a value in [1, getMaxWavesPerEU()].
//
// NOTE(review): this listing is a flattened diff, so deleted and added lines
// appear back to back without +/- markers and the text is not compilable as
// shown. The old/new split marked below follows from the hunk header
// (18 old lines, 41 new lines) and the "32 additions & 9 deletions" summary.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {
// Old version (deleted by this commit): work group bound and early-out test.
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
if (!WorkGroupsPerCu)
// New version (added by this commit): the same two queries, now held in const
// locals named for the maximum of the flat work group size range.
const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
if (!MaxWorkGroupsPerCu)
// Kept in both versions: getMaxWorkGroupsPerCU() returned 0, so report 0.
return 0;
// Old version (deleted by this commit): scaled the LDS size by
// MaxWaves / WorkGroupsPerCu, divided by the bytes used, then clamped the
// result to [1, MaxWaves].
unsigned MaxWaves = getMaxWavesPerEU();
unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
NumWaves = std::min(NumWaves, MaxWaves);
NumWaves = std::max(NumWaves, 1u);
return NumWaves;

// New version (added by this commit) from here to the closing brace.
const unsigned WaveSize = getWavefrontSize();

// FIXME: Do we need to account for alignment requirement of LDS rounding the
// size up?
// Compute restriction based on LDS usage
// The (Bytes ? Bytes : 1u) form avoids dividing by zero when no LDS is used.
unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

// This can be queried with more LDS than is possible, so just assume the
// worst.
if (NumGroups == 0)
return 1;

// The LDS-derived group count cannot exceed what getMaxWorkGroupsPerCU()
// allows for the maximum work group size.
NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

// Round to the number of waves.
// Ceiling division: a partially filled wave still counts as a whole wave.
const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

// Clamp to the maximum possible number of waves.
MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

// FIXME: Needs to be a multiple of the group size?
//MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

// Sanity check on the result: non-zero and within the per-EU wave limit.
assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
"computed invalid occupancy");
return MaxWaves;
}

unsigned
Expand Down
89 changes: 89 additions & 0 deletions llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
Expand Up @@ -283,6 +283,95 @@ define amdgpu_kernel void @used_lds_13112() {
ret void
}

; Kernel that stores to the 8252-byte addrspace(3) global @lds8252, compiled
; with attribute group #3 (flat work group size range "1,64"). The directives
; below pin the occupancy value expected in the output for each configuration.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_64:
; GFX9: ; Occupancy: 7{{$}}
; GFX1010W64: ; Occupancy: 7{{$}}
; GFX1010W32: ; Occupancy: 14{{$}}
@lds8252 = internal addrspace(3) global [8252 x i8] undef, align 4
define amdgpu_kernel void @used_lds_8252_max_group_size_64() #3 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
ret void
}

; Same volatile store into the 8252-byte addrspace(3) global @lds8252, compiled
; with attribute group #4 (flat work group size range "1,96").
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_96:
; GFX9: ; Occupancy: 10{{$}}
; GFX1010W64: ; Occupancy: 14{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_96() #4 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
ret void
}

; Same volatile store into the 8252-byte addrspace(3) global @lds8252, compiled
; with attribute group #5 (flat work group size range "1,128").
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_128:
; GFX9: ; Occupancy: 10{{$}}
; GFX1010W64: ; Occupancy: 14{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_128() #5 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
ret void
}

; Same volatile store into the 8252-byte addrspace(3) global @lds8252, compiled
; with attribute group #6 (flat work group size range "1,192").
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_192:
; GFX9: ; Occupancy: 10{{$}}
; GFX1010W64: ; Occupancy: 20{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_192() #6 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
ret void
}

; Same volatile store into the 8252-byte addrspace(3) global @lds8252, compiled
; with attribute group #7 (flat work group size range "1,256").
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_256:
; GFX9: ; Occupancy: 10{{$}}
; GFX1010W64: ; Occupancy: 20{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_256() #7 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
ret void
}

; Same volatile store into the 8252-byte addrspace(3) global @lds8252, compiled
; with attribute group #8 (flat work group size range "1,512").
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_512:
; GFX9: ; Occupancy: 10{{$}}
; GFX1010W64: ; Occupancy: 20{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_512() #8 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
ret void
}

; Same volatile store into the 8252-byte addrspace(3) global @lds8252, compiled
; with attribute group #9 (flat work group size range "1,1024").
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_1024:
; GFX9: ; Occupancy: 10{{$}}
; GFX1010W64: ; Occupancy: 20{{$}}
; GFX1010W32: ; Occupancy: 20{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_1024() #9 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
ret void
}

; Same volatile store into the 8252-byte addrspace(3) global @lds8252, compiled
; with attribute group #10 (flat work group size range "1,32"). All three
; configurations below expect the same occupancy value for this case.
; GCN-LABEL: {{^}}used_lds_8252_max_group_size_32:
; GFX9: ; Occupancy: 7{{$}}
; GFX1010W64: ; Occupancy: 7{{$}}
; GFX1010W32: ; Occupancy: 7{{$}}
define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
%p = bitcast [8252 x i8] addrspace(3)* @lds8252 to i8 addrspace(3)*
store volatile i8 1, i8 addrspace(3)* %p
ret void
}

; Attribute groups referenced by the kernels in this file.
; #0-#2 set "amdgpu-waves-per-eu" ranges.
; #3-#10 set "amdgpu-flat-work-group-size" ranges that all start at 1; the
; second number matches the size in the name of the kernel using the group.
attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
attributes #4 = { "amdgpu-flat-work-group-size"="1,96" }
attributes #5 = { "amdgpu-flat-work-group-size"="1,128" }
attributes #6 = { "amdgpu-flat-work-group-size"="1,192" }
attributes #7 = { "amdgpu-flat-work-group-size"="1,256" }
attributes #8 = { "amdgpu-flat-work-group-size"="1,512" }
attributes #9 = { "amdgpu-flat-work-group-size"="1,1024" }
attributes #10 = { "amdgpu-flat-work-group-size"="1,32" }

0 comments on commit 88aced1

Please sign in to comment.