Skip to content

Commit

Permalink
[NVPTX] Add NVPTXCtorDtorLoweringPass to handle global ctors / dtors
Browse files Browse the repository at this point in the history
This patch mostly adapts the existing AMDGPUCtorDtorLoweringPass for use
by the Nvidia backend. This pass transforms the ctor / dtor list into a
kernel call that can be used to invoke those functions. Furthermore, we
emit globals such that the names and addresses of these constructor
functions can be found by the driver. Unfortunately, since NVPTX has no
way to emit variables at a named section, nor a functioning linker to
provide the begin / end symbols, we need to mangle these names and have
an external application find them.

This work is related to the work in D149398 and D149340.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D149451
  • Loading branch information
jhuber6 committed May 4, 2023
1 parent 909095a commit f05ce90
Show file tree
Hide file tree
Showing 10 changed files with 232 additions and 7 deletions.
18 changes: 15 additions & 3 deletions clang/lib/Driver/ToolChains/Cuda.cpp
Expand Up @@ -695,8 +695,9 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
/// toolchain.
NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
const llvm::Triple &HostTriple,
const ArgList &Args)
: ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args) {
const ArgList &Args, bool Freestanding = false)
: ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args),
Freestanding(Freestanding) {
if (CudaInstallation.isValid()) {
CudaInstallation.WarnIfUnsupportedVersion();
getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
Expand All @@ -711,7 +712,8 @@ NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
const ArgList &Args)
: NVPTXToolChain(D, Triple,
llvm::Triple(llvm::sys::getDefaultTargetTriple()), Args) {}
llvm::Triple(llvm::sys::getDefaultTargetTriple()), Args,
/*Freestanding=*/true) {}

llvm::opt::DerivedArgList *
NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
Expand All @@ -735,6 +737,16 @@ NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
return DAL;
}

/// Adds NVPTX-specific options to the cc1 command line.
///
/// The only option added here is the backend flag that enables lowering of
/// global ctor / dtor functions to registrable global symbols, and it is only
/// added for a freestanding (standalone) NVPTX toolchain.
void NVPTXToolChain::addClangTargetOptions(
    const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
    Action::OffloadKind DeviceOffloadingKind) const {
  // Nothing to add unless this is a standalone NVPTX compilation.
  if (!Freestanding)
    return;

  // Mimic a standard environment as closely as possible by asking the backend
  // to lower ctor / dtor functions instead of rejecting them.
  CC1Args.push_back("-mllvm");
  CC1Args.push_back("--nvptx-lower-global-ctor-dtor");
}

bool NVPTXToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
const Option &O = A->getOption();
return (O.matches(options::OPT_gN_Group) &&
Expand Down
12 changes: 10 additions & 2 deletions clang/lib/Driver/ToolChains/Cuda.h
Expand Up @@ -132,8 +132,8 @@ namespace toolchains {
class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
public:
NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
const llvm::Triple &HostTriple,
const llvm::opt::ArgList &Args);
const llvm::Triple &HostTriple, const llvm::opt::ArgList &Args,
bool Freestanding);

NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
const llvm::opt::ArgList &Args);
Expand All @@ -142,6 +142,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch,
Action::OffloadKind DeviceOffloadKind) const override;

void
addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args,
Action::OffloadKind DeviceOffloadKind) const override;

// Never try to use the integrated assembler with CUDA; always fork out to
// ptxas.
bool useIntegratedAs() const override { return false; }
Expand All @@ -168,6 +173,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
protected:
Tool *buildAssembler() const override; // ptxas.
Tool *buildLinker() const override; // nvlink.

private:
bool Freestanding = false;
};

class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
Expand Down
9 changes: 9 additions & 0 deletions clang/test/Driver/cuda-cross-compiling.c
Expand Up @@ -68,3 +68,12 @@
// DEFAULT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_35" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
// DEFAULT-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_35" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
// DEFAULT-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_35" {{.*}} "[[CUBIN]].cubin"

//
// Test to ensure that we enable handling global constructors in a freestanding
// Nvidia compilation.
//
// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_70 %s -### 2>&1 \
// RUN: | FileCheck -check-prefix=LOWERING %s

// LOWERING: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-mllvm" "--nvptx-lower-global-ctor-dtor"
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/CMakeLists.txt
Expand Up @@ -37,6 +37,7 @@ set(NVPTXCodeGen_sources
NVVMIntrRange.cpp
NVVMReflect.cpp
NVPTXProxyRegErasure.cpp
NVPTXCtorDtorLowering.cpp
)

add_llvm_target(NVPTXCodeGen
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTX.h
Expand Up @@ -39,6 +39,7 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
llvm::CodeGenOpt::Level OptLevel);
ModulePass *createNVPTXAssignValidGlobalNamesPass();
ModulePass *createGenericToNVVMLegacyPass();
ModulePass *createNVPTXCtorDtorLoweringLegacyPass();
FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
MachineFunctionPass *createNVPTXPrologEpilogPass();
Expand Down
11 changes: 9 additions & 2 deletions llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
Expand Up @@ -92,6 +92,11 @@

using namespace llvm;

// Hidden command-line switch, off by default. When it is set, the checks in
// doInitialization no longer treat a nontrivial llvm.global_ctors /
// llvm.global_dtors list as a fatal error.
static cl::opt<bool>
LowerCtorDtor("nvptx-lower-global-ctor-dtor",
cl::desc("Lower GPU ctor / dtors to globals on the device."),
cl::init(false), cl::Hidden);

#define DEPOTNAME "__local_depot"

/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
Expand Down Expand Up @@ -788,12 +793,14 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
report_fatal_error("Module has aliases, which NVPTX does not support.");
return true; // error
}
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors"))) {
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors")) &&
!LowerCtorDtor) {
report_fatal_error(
"Module has a nontrivial global ctor, which NVPTX does not support.");
return true; // error
}
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors"))) {
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors")) &&
!LowerCtorDtor) {
report_fatal_error(
"Module has a nontrivial global dtor, which NVPTX does not support.");
return true; // error
Expand Down
116 changes: 116 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp
@@ -0,0 +1,116 @@
//===-- NVPTXCtorDtorLowering.cpp - Handle global ctors and dtors --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This pass lowers the global ctor / dtor lists into uniquely named global
/// variables that an external tool or runtime can look up.
//===----------------------------------------------------------------------===//

#include "NVPTXCtorDtorLowering.h"
#include "NVPTX.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-lower-ctor-dtor"

// Hidden option that overrides the ID embedded in the names of the generated
// ctor / dtor globals. When left empty (the default), a hash of the module's
// source file name is used instead.
static cl::opt<std::string>
GlobalStr("nvptx-lower-global-ctor-dtor-id",
cl::desc("Override unique ID of ctor/dtor globals."),
cl::init(""), cl::Hidden);

namespace {

/// Returns the low 64 bits of the MD5 digest of \p Str, rendered as a
/// lower-case hexadecimal string.
static std::string getHash(StringRef Str) {
  llvm::MD5 Hasher;
  Hasher.update(Str);

  llvm::MD5::MD5Result Digest;
  Hasher.final(Digest);

  const uint64_t LowWord = Digest.low();
  return llvm::utohexstr(LowWord, /*LowerCase=*/true);
}

/// Lowers the special array named \p GlobalName (`llvm.global_ctors` or
/// `llvm.global_dtors`) into one named global variable per entry.
///
/// Each emitted global is a constant in address space 4 whose initializer is
/// the ctor / dtor function, named
/// `__{init,fini}_array_object_<function>_<id>_<priority>` with every '.'
/// replaced by '_'. `<id>` is the `-nvptx-lower-global-ctor-dtor-id` override
/// if one was given, otherwise a hash of the module's source file name. The
/// globals are appended to `llvm.used` and the original array is erased.
///
/// \param M          Module to modify.
/// \param GlobalName Name of the ctor / dtor list to lower.
/// \param IsCtor     True for constructors (`__init_array_object_` /
///                   `.init_array`), false for destructors.
/// \returns true if the list existed with a non-empty array initializer and
///          was lowered (the module changed), false otherwise.
static bool createInitOrFiniGlobls(Module &M, StringRef GlobalName,
                                   bool IsCtor) {
  GlobalVariable *ListGV = M.getGlobalVariable(GlobalName);
  if (!ListGV || !ListGV->hasInitializer())
    return false;
  ConstantArray *GA = dyn_cast<ConstantArray>(ListGV->getInitializer());
  if (!GA || GA->getNumOperands() == 0)
    return false;

  // The semi-unique ID depends only on the module (or the command-line
  // override), so compute it once rather than re-hashing for every entry.
  const std::string GlobalID =
      !GlobalStr.empty() ? GlobalStr : getHash(M.getSourceFileName());

  // NVPTX has no way to emit variables at specific sections or support for
  // the traditional constructor sections. Instead, we emit mangled global
  // names so the runtime can build the list manually.
  for (Value *V : GA->operands()) {
    // Entries that are not a concrete struct (e.g. zeroinitializer) carry no
    // function to register; skip them instead of asserting in cast<>.
    auto *CS = dyn_cast<ConstantStruct>(V);
    if (!CS)
      continue;

    // A null function pointer has nothing to call and no name to mangle.
    auto *F = cast<Constant>(CS->getOperand(1));
    if (F->isNullValue())
      continue;

    // Zero-extend the priority: sign-extending into an unsigned 64-bit value
    // would turn priorities with the top bit set into huge numbers in the
    // emitted name and section.
    const uint64_t Priority =
        cast<ConstantInt>(CS->getOperand(0))->getZExtValue();
    const std::string PriorityStr = std::to_string(Priority);

    // We append a semi-unique hash and the priority to the global name.
    std::string NameStr =
        ((IsCtor ? "__init_array_object_" : "__fini_array_object_") +
         F->getName() + "_" + GlobalID + "_" + PriorityStr)
            .str();
    // PTX does not support exported names with '.' in them.
    llvm::transform(NameStr, NameStr.begin(),
                    [](char c) { return c == '.' ? '_' : c; });

    auto *EntryGV = new GlobalVariable(M, F->getType(), /*IsConstant=*/true,
                                       GlobalValue::ExternalLinkage, F,
                                       NameStr, nullptr,
                                       GlobalValue::NotThreadLocal,
                                       /*AddressSpace=*/4);
    // This isn't respected by Nvidia, simply put here for clarity.
    EntryGV->setSection((IsCtor ? ".init_array." : ".fini_array.") +
                        PriorityStr);
    EntryGV->setVisibility(GlobalVariable::ProtectedVisibility);
    appendToUsed(M, {EntryGV});
  }

  // The original list has been replaced by the per-entry globals above.
  ListGV->eraseFromParent();
  return true;
}

/// Lowers both `llvm.global_ctors` and `llvm.global_dtors` in \p M.
///
/// Both lists are always processed, constructors first.
/// \returns true if either lowering modified the module.
static bool lowerCtorsAndDtors(Module &M) {
  const bool LoweredCtors =
      createInitOrFiniGlobls(M, "llvm.global_ctors", /*IsCtor =*/true);
  const bool LoweredDtors =
      createInitOrFiniGlobls(M, "llvm.global_dtors", /*IsCtor =*/false);
  return LoweredCtors || LoweredDtors;
}

/// Legacy pass manager wrapper around lowerCtorsAndDtors().
class NVPTXCtorDtorLoweringLegacy final : public ModulePass {
public:
  static char ID;

  NVPTXCtorDtorLoweringLegacy() : ModulePass(ID) {}

  /// Lowers the ctor / dtor lists of \p M and reports whether the module was
  /// changed.
  bool runOnModule(Module &M) override {
    const bool Changed = lowerCtorsAndDtors(M);
    return Changed;
  }
};

} // End anonymous namespace

/// New pass manager entry point: lowers the ctor / dtor lists of \p M.
///
/// \returns PreservedAnalyses::all() when the module was left untouched and
///          PreservedAnalyses::none() when it was modified.
PreservedAnalyses NVPTXCtorDtorLoweringPass::run(Module &M,
                                                 ModuleAnalysisManager &AM) {
  if (!lowerCtorsAndDtors(M))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}

// Storage for the legacy pass's ID member, plus an externally visible
// reference to it (declared in NVPTXCtorDtorLowering.h).
char NVPTXCtorDtorLoweringLegacy::ID = 0;
char &llvm::NVPTXCtorDtorLoweringLegacyPassID = NVPTXCtorDtorLoweringLegacy::ID;
// Registers the legacy pass under DEBUG_TYPE ("nvptx-lower-ctor-dtor").
INITIALIZE_PASS(NVPTXCtorDtorLoweringLegacy, DEBUG_TYPE,
"Lower ctors and dtors for NVPTX", false, false)

/// Factory for the legacy pass manager version of the ctor / dtor lowering.
/// Returns a newly allocated pass object.
ModulePass *llvm::createNVPTXCtorDtorLoweringLegacyPass() {
  ModulePass *LoweringPass = new NVPTXCtorDtorLoweringLegacy();
  return LoweringPass;
}
30 changes: 30 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.h
@@ -0,0 +1,30 @@
//===-- NVPTXCtorDtorLowering.h --------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXCTORDTORLOWERING_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXCTORDTORLOWERING_H

#include "llvm/IR/PassManager.h"

namespace llvm {
class Module;
class PassRegistry;

extern char &NVPTXCtorDtorLoweringLegacyPassID;
extern void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);

/// New pass manager pass that lowers llvm.global_ctors and llvm.global_dtors.
/// The implementation in NVPTXCtorDtorLowering.cpp replaces each list with
/// individually named global variables that reference the ctor / dtor
/// functions.
class NVPTXCtorDtorLoweringPass
: public PassInfoMixin<NVPTXCtorDtorLoweringPass> {
public:
/// Runs the lowering on \p M. Returns PreservedAnalyses::none() if the module
/// was modified and PreservedAnalyses::all() otherwise.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

} // namespace llvm

#endif // LLVM_LIB_TARGET_NVPTX_NVPTXCTORDTORLOWERING_H
9 changes: 9 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
Expand Up @@ -15,6 +15,7 @@
#include "NVPTXAliasAnalysis.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXCtorDtorLowering.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXTargetObjectFile.h"
Expand Down Expand Up @@ -68,8 +69,10 @@ void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);
void initializeNVVMIntrRangePass(PassRegistry &);
Expand All @@ -95,6 +98,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
initializeNVPTXAtomicLowerPass(PR);
initializeNVPTXLowerArgsPass(PR);
initializeNVPTXLowerAllocaPass(PR);
initializeNVPTXCtorDtorLoweringLegacyPass(PR);
initializeNVPTXLowerAggrCopiesPass(PR);
initializeNVPTXProxyRegErasurePass(PR);
initializeNVPTXDAGToDAGISelPass(PR);
Expand Down Expand Up @@ -249,6 +253,10 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerPipelineParsingCallback(
[](StringRef PassName, ModulePassManager &PM,
ArrayRef<PassBuilder::PipelineElement>) {
if (PassName == "nvptx-lower-ctor-dtor") {
PM.addPass(NVPTXCtorDtorLoweringPass());
return true;
}
if (PassName == "generic-to-nvvm") {
PM.addPass(GenericToNVVMPass());
return true;
Expand Down Expand Up @@ -369,6 +377,7 @@ void NVPTXPassConfig::addIRPasses() {
}

addPass(createAtomicExpandPass());
addPass(createNVPTXCtorDtorLoweringLegacyPass());

// === LSR and other generic IR passes ===
TargetPassConfig::addIRPasses();
Expand Down
32 changes: 32 additions & 0 deletions llvm/test/CodeGen/NVPTX/lower-ctor-dtor.ll
@@ -0,0 +1,32 @@
; RUN: opt -S -mtriple=nvptx64-- -nvptx-lower-ctor-dtor < %s | FileCheck %s
; RUN: opt -S -mtriple=nvptx64-- -passes=nvptx-lower-ctor-dtor < %s | FileCheck %s
; RUN: opt -S -mtriple=nvptx64-- -passes=nvptx-lower-ctor-dtor \
; RUN: -nvptx-lower-global-ctor-dtor-id=unique_id < %s | FileCheck %s --check-prefix=GLOBAL

; Make sure we get the same result if we run multiple times
; RUN: opt -S -mtriple=nvptx64-- -passes=nvptx-lower-ctor-dtor,nvptx-lower-ctor-dtor < %s | FileCheck %s
; RUN: llc -nvptx-lower-global-ctor-dtor -mtriple=nvptx64-amd-amdhsa -mcpu=sm_70 -filetype=asm -o - < %s | FileCheck %s -check-prefix=VISIBILITY

@llvm.global_ctors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @foo, ptr null }]
@llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @bar, ptr null }]

; CHECK-NOT: @llvm.global_ctors
; CHECK-NOT: @llvm.global_dtors

; CHECK: @__init_array_object_foo_[[HASH:[0-9a-f]+]]_1 = protected addrspace(4) constant ptr @foo, section ".init_array.1"
; CHECK: @__fini_array_object_bar_[[HASH:[0-9a-f]+]]_1 = protected addrspace(4) constant ptr @bar, section ".fini_array.1"
; CHECK: @llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @__init_array_object_foo_[[HASH]]_1 to ptr), ptr addrspacecast (ptr addrspace(4) @__fini_array_object_bar_[[HASH]]_1 to ptr)], section "llvm.metadata"
; GLOBAL: @__init_array_object_foo_unique_id_1 = protected addrspace(4) constant ptr @foo, section ".init_array.1"
; GLOBAL: @__fini_array_object_bar_unique_id_1 = protected addrspace(4) constant ptr @bar, section ".fini_array.1"
; GLOBAL: @llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @__init_array_object_foo_unique_id_1 to ptr), ptr addrspacecast (ptr addrspace(4) @__fini_array_object_bar_unique_id_1 to ptr)], section "llvm.metadata"

; VISIBILITY: .visible .const .align 8 .u64 __init_array_object_foo_[[HASH:[0-9a-f]+]]_1 = foo;
; VISIBILITY: .visible .const .align 8 .u64 __fini_array_object_bar_[[HASH:[0-9a-f]+]]_1 = bar;

; Empty function registered as the priority-1 entry of @llvm.global_ctors.
define internal void @foo() {
ret void
}

; Empty function registered as the priority-1 entry of @llvm.global_dtors.
define internal void @bar() {
ret void
}

0 comments on commit f05ce90

Please sign in to comment.