Skip to content

Commit

Permalink
[NVPTX] Allow compiling LLVM-IR without -march set (#79873)
Browse files Browse the repository at this point in the history
Summary:
The NVPTX tools require an architecture to be used; however, if we are
creating generic LLVM-IR we should be able to leave it unspecified. This
will result in the `target-cpu` attributes not being set on the
functions so it can be changed when linked into code. This allows the
standalone `--target=nvptx64-nvidia-cuda` toolchain to create LLVM-IR
similar to how CUDA's deviceRTL looks from C/C++.
  • Loading branch information
jhuber6 committed Jan 31, 2024
1 parent c19436e commit 7155c1e
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 19 deletions.
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/DiagnosticDriverKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def warn_drv_avr_stdlib_not_linked: Warning<
InGroup<AVRRtlibLinkingQuirks>;
def err_drv_cuda_bad_gpu_arch : Error<"unsupported CUDA gpu architecture: %0">;
def err_drv_offload_bad_gpu_arch : Error<"unsupported %0 gpu architecture: %1">;
def err_drv_offload_missing_gpu_arch : Error<
"Must pass in an explicit %0 gpu architecture to '%1'">;
def err_drv_no_cuda_installation : Error<
"cannot find CUDA installation; provide its path via '--cuda-path', or pass "
"'-nocudainc' to build without CUDA includes">;
Expand Down
9 changes: 7 additions & 2 deletions clang/lib/Basic/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
// Define available target features
// These must be defined in sorted order!
NoAsmVariants = true;
GPU = CudaArch::SM_20;
GPU = CudaArch::UNUSED;

if (TargetPointerWidth == 32)
resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
Expand Down Expand Up @@ -169,6 +169,11 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
MacroBuilder &Builder) const {
Builder.defineMacro("__PTX__");
Builder.defineMacro("__NVPTX__");

// Skip setting architecture dependent macros if undefined.
if (GPU == CudaArch::UNUSED && !HostTarget)
return;

if (Opts.CUDAIsDevice || Opts.OpenMPIsTargetDevice || !HostTarget) {
// Set __CUDA_ARCH__ for the GPU specified.
std::string CUDAArchCode = [this] {
Expand Down Expand Up @@ -220,10 +225,10 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
case CudaArch::Generic:
case CudaArch::LAST:
break;
case CudaArch::UNUSED:
case CudaArch::UNKNOWN:
assert(false && "No GPU arch when compiling CUDA device code.");
return "";
case CudaArch::UNUSED:
case CudaArch::SM_20:
return "200";
case CudaArch::SM_21:
Expand Down
3 changes: 2 additions & 1 deletion clang/lib/Basic/Targets/NVPTX.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
StringRef CPU,
const std::vector<std::string> &FeaturesVec) const override {
Features[CudaArchToString(GPU)] = true;
if (GPU != CudaArch::UNUSED)
Features[CudaArchToString(GPU)] = true;
Features["ptx" + std::to_string(PTXVersion)] = true;
return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
}
Expand Down
20 changes: 14 additions & 6 deletions clang/lib/Driver/ToolChains/Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,11 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
GPUArchName = JA.getOffloadingArch();
} else {
GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
assert(!GPUArchName.empty() && "Must have an architecture passed in.");
if (GPUArchName.empty()) {
C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)
<< getToolChain().getArchName() << getShortName();
return;
}
}

// Obtain architecture from the action.
Expand Down Expand Up @@ -593,7 +597,11 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-v");

StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
assert(!GPUArch.empty() && "At least one GPU Arch required for nvlink.");
if (GPUArch.empty()) {
C.getDriver().Diag(diag::err_drv_offload_missing_gpu_arch)
<< getToolChain().getArchName() << getShortName();
return;
}

CmdArgs.push_back("-arch");
CmdArgs.push_back(Args.MakeArgString(GPUArch));
Expand Down Expand Up @@ -726,9 +734,8 @@ NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
llvm::opt::DerivedArgList *
NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
StringRef BoundArch,
Action::OffloadKind DeviceOffloadKind) const {
DerivedArgList *DAL =
ToolChain::TranslateArgs(Args, BoundArch, DeviceOffloadKind);
Action::OffloadKind OffloadKind) const {
DerivedArgList *DAL = ToolChain::TranslateArgs(Args, BoundArch, OffloadKind);
if (!DAL)
DAL = new DerivedArgList(Args.getBaseArgs());

Expand All @@ -738,7 +745,8 @@ NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
if (!llvm::is_contained(*DAL, A))
DAL->append(A);

if (!DAL->hasArg(options::OPT_march_EQ)) {
// TODO: We should accept 'generic' as a valid architecture.
if (!DAL->hasArg(options::OPT_march_EQ) && OffloadKind != Action::OFK_None) {
DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
CudaArchToString(CudaArch::CudaDefault));
} else if (DAL->getLastArgValue(options::OPT_march_EQ) == "native") {
Expand Down
24 changes: 14 additions & 10 deletions clang/test/Driver/cuda-cross-compiling.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,6 @@

// LINK: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "{{.*}}.cubin"

//
// Test the generated arguments default to a value with no architecture.
//
// RUN: %clang --target=nvptx64-nvidia-cuda -### --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
// RUN: | FileCheck -check-prefix=DEFAULT %s

// DEFAULT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_52" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
// DEFAULT-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_52" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
// DEFAULT-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_52" {{.*}} "[[CUBIN]].cubin"

//
// Test to ensure that we enable handling global constructors in a freestanding
// Nvidia compilation.
Expand All @@ -77,3 +67,17 @@
// RUN: | FileCheck -check-prefix=LOWERING %s

// LOWERING: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-mllvm" "--nvptx-lower-global-ctor-dtor"

//
// Tests for handling a missing architecture.
//
// RUN: not %clang -target nvptx64-nvidia-cuda %s -### 2>&1 \
// RUN: | FileCheck -check-prefix=MISSING %s

// MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'ptxas'
// MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'nvlink'

// RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \
// RUN: | FileCheck -check-prefix=GENERIC %s

// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu"
12 changes: 12 additions & 0 deletions clang/test/Preprocessor/predefined-arch-macros.c
Original file line number Diff line number Diff line change
Expand Up @@ -4292,6 +4292,18 @@
// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
// CHECK_SYSTEMZ_ZVECTOR: #define __VEC__ 10304

// Begin nvptx tests ----------------

// RUN: %clang -march=sm_75 -E -dM %s -o - 2>&1 \
// RUN: -target nvptx64-unknown-unknown \
// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_NVPTX,CHECK_ARCH_SM_75
// RUN: %clang -E -dM %s -o - 2>&1 \
// RUN: -target nvptx64-unknown-unknown \
// RUN: | FileCheck -match-full-lines %s -check-prefixes=CHECK_NVPTX,CHECK_ARCH_UNSET
// CHECK_ARCH_SM_75: #define __CUDA_ARCH__ 750
// CHECK_ARCH_UNSET-NOT: #define __CUDA_ARCH__
// CHECK_NVPTX: #define __NVPTX__ 1

// Begin amdgcn tests ----------------

// RUN: %clang -mcpu=gfx803 -E -dM %s -o - 2>&1 \
Expand Down

0 comments on commit 7155c1e

Please sign in to comment.