diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2b457fe519d96..c96625092a76c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -793,6 +793,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
 
         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
       });
+
+  PB.registerFullLinkTimeOptimizationLastEPCallback(
+      [this](ModulePassManager &PM, OptimizationLevel Level) {
+        // Support the -lto-partitions=N option on a "best effort" basis.
+        // To do so, LDS must be lowered here, during the full LTO pipeline,
+        // i.e. before the module is partitioned for codegen.
+        if (EnableLowerModuleLDS)
+          PM.addPass(AMDGPULowerModuleLDSPass(*this));
+      });
 }
 
 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
new file mode 100644
index 0000000000000..b813b8047bf24
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
@@ -0,0 +1,47 @@
+; Check that AMDGPULowerModuleLDSPass runs during full LTO (default and unified) at O0-O3, and again in the CG pipeline.
+; Default O0
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O0
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O1
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O1
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O2
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O2
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O3
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O3
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; The first print of the pass comes from the new PM during the full LTO pipeline.
+; The second print comes from the legacy PM during the CG pipeline.
+
+; CHECK: Running pass: AMDGPULowerModuleLDSPass on [module]
+; CHECK: ModulePass Manager
+; CHECK: Lower uses of LDS variables from non-kernel functions
+
+@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
+
+define amdgpu_kernel void @test() {
+entry:
+  store i32 1, ptr addrspace(3) @lds
+  ret void
+}