Skip to content

Commit

Permalink
[AMDGPU] Fix module LDS selection
Browse files Browse the repository at this point in the history
Accesses to the global module LDS variable start from address null,
but the kernel also assumes that its own variables start at address
null. Fixed by not using null as the address.

Differential Revision: https://reviews.llvm.org/D102882
  • Loading branch information
rampitec committed May 20, 2021
1 parent b114055 commit 748db5b
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 18 deletions.
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Expand Up @@ -1305,7 +1305,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,

if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction()) {
if (!MFI->isModuleEntryFunction() &&
!GV->getName().equals("llvm.amdgcn.module.lds")) {
SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Expand Up @@ -2286,7 +2286,8 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction()) {
if (!MFI->isModuleEntryFunction() &&
!GV->getName().equals("llvm.amdgcn.module.lds")) {
const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
Expand Down
4 changes: 1 addition & 3 deletions llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
Expand Up @@ -212,8 +212,6 @@ class AMDGPULowerModuleLDS : public ModulePass {

Align MaxAlign =
AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
Constant *InstanceAddress = Constant::getIntegerValue(
PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));

GlobalVariable *SGV = new GlobalVariable(
M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
Expand All @@ -236,7 +234,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
GlobalVariable *GV = LocalVars[I];
Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
GV->replaceAllUsesWith(
ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx));
GV->eraseFromParent();
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
Expand Up @@ -64,7 +64,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,

void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
if (isModuleEntryFunction()) {
GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.module.lds");
const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
if (GV) {
unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
(void)Offset;
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
Expand Up @@ -19,15 +19,15 @@
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4

; CHECK-LABEL: @get_func()
; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define i32 @get_func() local_unnamed_addr #0 {
entry:
%0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
ret i32 %0
}

; CHECK-LABEL: @set_func(i32 %x)
; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define void @set_func(i32 %x) local_unnamed_addr #1 {
entry:
store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
Expand All @@ -36,9 +36,9 @@ entry:

; CHECK-LABEL: @timestwo()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
; CHECK: %mul = mul i32 %ld, 2
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
define amdgpu_kernel void @timestwo() {
%ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
%mul = mul i32 %ld, 2
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
Expand Up @@ -3,9 +3,9 @@

; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }

; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8
; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to float*), align 8

; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8
; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to double*), align 8

; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8

Expand Down
47 changes: 47 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -0,0 +1,47 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Check that module LDS is allocated at address 0 and that the kernel starts
; its allocation past the module LDS.

@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16

; GCN-LABEL: {{^}}k0:
; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN: ds_write_b8 [[NULL]], [[ONE]]
; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16
define amdgpu_kernel void @k0() {
; OPT-LABEL: @k0(
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
; OPT-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
ret void
}

; GCN-LABEL: {{^}}f0:
; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
; GCN: ds_write_b8 [[NULL]], [[TREE]]
define void @f0() {
; OPT-LABEL: @f0(
; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
; OPT-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
ret void
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
Expand Up @@ -29,7 +29,7 @@
@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"

; CHECK-LABEL: @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 4
define void @func() {
%dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
%unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
Expand Up @@ -21,12 +21,12 @@
; Instance of new type, aligned to max of element alignment
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8

; Use in func rewritten to access struct at address zero, which prints as null
; Use in func rewritten to access struct at address zero
; CHECK-LABEL: @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.0
; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
; CHECK: %val1 = add i32 %val0, 4
; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
define void @func() {
%dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
Expand All @@ -41,7 +41,7 @@ define void @func() {
; CHECK-LABEL: @kern_call()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK: call void @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 4
define amdgpu_kernel void @kern_call() {
call void @func()
%dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
Expand Down

0 comments on commit 748db5b

Please sign in to comment.