Skip to content

Commit

Permalink
[AMDGPU] Introduce command line switch to control super aligning of LDS.
Browse files Browse the repository at this point in the history
Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D103817
  • Loading branch information
hs-mahesha committed Jun 7, 2021
1 parent 3af5f3e commit 713ca2f
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 23 deletions.
48 changes: 25 additions & 23 deletions llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
Expand Up @@ -37,6 +37,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
Expand All @@ -46,6 +47,11 @@

using namespace llvm;

// Hidden command line switch (-amdgpu-super-align-lds-globals) that gates the
// step which raises the alignment of LDS globals based on their allocation
// size. It defaults to true; passing =false skips that step so the globals
// are not over-aligned by it.
static cl::opt<bool> SuperAlignLDSGlobals(
"amdgpu-super-align-lds-globals",
cl::desc("Increase alignment of LDS if it is not on align boundary"),
cl::init(true), cl::Hidden);

namespace {

class AMDGPULowerModuleLDS : public ModulePass {
Expand Down Expand Up @@ -174,31 +180,27 @@ class AMDGPULowerModuleLDS : public ModulePass {

// Increase the alignment of LDS globals if necessary to maximise the chance
// that we can use aligned LDS instructions to access them.
for (auto *GV : FoundLocalVars) {
unsigned AlignValue = GV->getAlignment();
if (AlignValue == 0) {
GV->setAlignment(DL.getABITypeAlign(GV->getValueType()));
continue;
}
if (SuperAlignLDSGlobals) {
for (auto *GV : FoundLocalVars) {
Align Alignment = AMDGPU::getAlign(DL, GV);
TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());

if (GVSize > 8) {
// We might want to use a b96 or b128 load/store
Alignment = std::max(Alignment, Align(16));
} else if (GVSize > 4) {
// We might want to use a b64 load/store
Alignment = std::max(Alignment, Align(8));
} else if (GVSize > 2) {
// We might want to use a b32 load/store
Alignment = std::max(Alignment, Align(4));
} else if (GVSize > 1) {
// We might want to use a b16 load/store
Alignment = std::max(Alignment, Align(2));
}

Align Alignment(AlignValue);
TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());

if (GVSize > 8) {
// We might want to use a b96 or b128 load/store
Alignment = std::max(Alignment, Align(16));
} else if (GVSize > 4) {
// We might want to use a b64 load/store
Alignment = std::max(Alignment, Align(8));
} else if (GVSize > 2) {
// We might want to use a b32 load/store
Alignment = std::max(Alignment, Align(4));
} else if (GVSize > 1) {
// We might want to use a b16 load/store
Alignment = std::max(Alignment, Align(2));
GV->setAlignment(Alignment);
}

GV->setAlignment(Alignment);
}

// Sort by alignment, descending, to minimise padding.
Expand Down
23 changes: 23 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
@@ -0,0 +1,23 @@
; Exercises the -amdgpu-super-align-lds-globals switch of the
; amdgpu-lower-module-lds pass under both the legacy and the new pass manager
; invocations. With the default setting the kernel LDS struct is expected to
; get align 16; with the switch set to false it is expected to keep align 1.
;
; NOTE(review): the invocations below enable only the SUPER-ALIGN_ON and
; SUPER-ALIGN_OFF prefixes, so FileCheck does not evaluate the directives in
; this file that use the bare CHECK prefix. Presumably the intent was
; --check-prefixes=CHECK,SUPER-ALIGN_ON (and likewise for the OFF runs);
; confirm that those dormant patterns match the actual output before
; enabling them.
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck --check-prefix=SUPER-ALIGN_ON %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck --check-prefix=SUPER-ALIGN_ON %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefix=SUPER-ALIGN_OFF %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefix=SUPER-ALIGN_OFF %s

; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [32 x i8] }

; CHECK-NOT: @lds.1
; A 32-byte LDS (addrspace 3) array declared with align 1, i.e. larger than
; 8 bytes and below the 16-byte alignment expected in the default run.
@lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1

; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 16
; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 1

; CHECK-LABEL: @k4
; CHECK: %ptr = getelementptr inbounds i8, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k4.lds.t, %llvm.amdgcn.kernel.k4.lds.t addrspace(3)* @llvm.amdgcn.kernel.k4.lds, i32 0, i32
; CHECK: 0, i32 0) to i8*), i64 %x
; CHECK: store i8 1, i8* %ptr, align 1
; CHECK: ret void
; Kernel that stores one byte into @lds.1 at offset %x through a constant
; addrspacecast expression, giving the kernel a use of the LDS variable.
define amdgpu_kernel void @k4(i64 %x) {
%ptr = getelementptr inbounds i8, i8* addrspacecast ([32 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
store i8 1, i8 addrspace(0)* %ptr, align 1
ret void
}

0 comments on commit 713ca2f

Please sign in to comment.