1,110 changes: 1,093 additions & 17 deletions clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Large diffs are not rendered by default.

267 changes: 267 additions & 0 deletions clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
//===- OffloadWrapper.cpp ---------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "OffloadWrapper.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

using namespace llvm;

namespace {

/// Returns the integer type that has the same width as an i8 pointer in the
/// data layout of \p M: i32 for 4-byte pointers and i64 for 8-byte pointers.
/// Any other pointer size is treated as unreachable.
IntegerType *getSizeTTy(Module &M) {
  LLVMContext &Ctx = M.getContext();
  const auto PtrBytes =
      M.getDataLayout().getPointerTypeSize(Type::getInt8PtrTy(Ctx));
  if (PtrBytes == 4u)
    return Type::getInt32Ty(Ctx);
  if (PtrBytes == 8u)
    return Type::getInt64Ty(Ctx);
  llvm_unreachable("unsupported pointer type size");
}

/// Returns the struct type named "__tgt_offload_entry" from the context of
/// \p M, creating it if the context does not have it yet. The layout is
///
///   struct __tgt_offload_entry {
///     void *addr;
///     char *name;
///     size_t size;
///     int32_t flags;
///     int32_t reserved;
///   };
StructType *getEntryTy(Module &M) {
  LLVMContext &Ctx = M.getContext();
  if (StructType *Known =
          StructType::getTypeByName(Ctx, "__tgt_offload_entry"))
    return Known;

  auto *BytePtrTy = Type::getInt8PtrTy(Ctx);
  auto *Int32Ty = Type::getInt32Ty(Ctx);
  return StructType::create("__tgt_offload_entry", BytePtrTy, BytePtrTy,
                            getSizeTTy(M), Int32Ty, Int32Ty);
}

/// Returns the unqualified pointer type to the "__tgt_offload_entry" struct.
PointerType *getEntryPtrTy(Module &M) {
  StructType *EntryTy = getEntryTy(M);
  return PointerType::getUnqual(EntryTy);
}

/// Returns the struct type named "__tgt_device_image" from the context of
/// \p M, creating it if the context does not have it yet. The layout is
///
///   struct __tgt_device_image {
///     void *ImageStart;
///     void *ImageEnd;
///     __tgt_offload_entry *EntriesBegin;
///     __tgt_offload_entry *EntriesEnd;
///   };
StructType *getDeviceImageTy(Module &M) {
  LLVMContext &Ctx = M.getContext();
  if (StructType *Known = StructType::getTypeByName(Ctx, "__tgt_device_image"))
    return Known;

  auto *BytePtrTy = Type::getInt8PtrTy(Ctx);
  auto *EntryPtrTy = getEntryPtrTy(M);
  return StructType::create("__tgt_device_image", BytePtrTy, BytePtrTy,
                            EntryPtrTy, EntryPtrTy);
}

/// Returns the unqualified pointer type to the "__tgt_device_image" struct.
PointerType *getDeviceImagePtrTy(Module &M) {
  StructType *ImageTy = getDeviceImageTy(M);
  return PointerType::getUnqual(ImageTy);
}

/// Returns the struct type named "__tgt_bin_desc" from the context of \p M,
/// creating it if the context does not have it yet. The layout is
///
///   struct __tgt_bin_desc {
///     int32_t NumDeviceImages;
///     __tgt_device_image *DeviceImages;
///     __tgt_offload_entry *HostEntriesBegin;
///     __tgt_offload_entry *HostEntriesEnd;
///   };
StructType *getBinDescTy(Module &M) {
  LLVMContext &Ctx = M.getContext();
  if (StructType *Known = StructType::getTypeByName(Ctx, "__tgt_bin_desc"))
    return Known;

  auto *EntryPtrTy = getEntryPtrTy(M);
  return StructType::create("__tgt_bin_desc", Type::getInt32Ty(Ctx),
                            getDeviceImagePtrTy(M), EntryPtrTy, EntryPtrTy);
}

/// Returns the unqualified pointer type to the "__tgt_bin_desc" struct.
PointerType *getBinDescPtrTy(Module &M) {
  StructType *DescTy = getBinDescTy(M);
  return PointerType::getUnqual(DescTy);
}

/// Builds the binary descriptor for the device images in \p Bufs and returns
/// the global variable that holds it. The descriptor is the object handed to
/// the offloading runtime; it lists every device image embedded in \p M
/// together with the bounds of the offload entries table. The globals emitted
/// into \p M correspond to the following pseudo-C:
///
///   __attribute__((visibility("hidden")))
///   extern const __tgt_offload_entry __start_omp_offloading_entries;
///   __attribute__((visibility("hidden")))
///   extern const __tgt_offload_entry __stop_omp_offloading_entries;
///
///   static const char Image0[] = { <Bufs.front() contents> };
///   ...
///   static const char ImageN[] = { <Bufs.back() contents> };
///
///   static const __tgt_device_image Images[] = {
///     {
///       Image0,                            /*ImageStart*/
///       Image0 + sizeof(Image0),           /*ImageEnd*/
///       &__start_omp_offloading_entries,   /*EntriesBegin*/
///       &__stop_omp_offloading_entries     /*EntriesEnd*/
///     },
///     ...
///     {
///       ImageN,                            /*ImageStart*/
///       ImageN + sizeof(ImageN),           /*ImageEnd*/
///       &__start_omp_offloading_entries,   /*EntriesBegin*/
///       &__stop_omp_offloading_entries     /*EntriesEnd*/
///     }
///   };
///
///   static const __tgt_bin_desc BinDesc = {
///     sizeof(Images) / sizeof(Images[0]),  /*NumDeviceImages*/
///     Images,                              /*DeviceImages*/
///     &__start_omp_offloading_entries,     /*HostEntriesBegin*/
///     &__stop_omp_offloading_entries       /*HostEntriesEnd*/
///   };
///
/// \param M    Module that receives all generated globals.
/// \param Bufs Raw contents of the device images, one buffer per image.
/// \returns The global variable representing BinDesc.
GlobalVariable *createBinDesc(Module &M, ArrayRef<ArrayRef<char>> Bufs) {
  LLVMContext &Ctx = M.getContext();
  StructType *EntryTy = getEntryTy(M);
  IntegerType *SizeTy = getSizeTTy(M);

  // Declare the hidden, external symbols that delimit the offload entries
  // table. They carry no initializer here.
  auto *EntriesBegin = new GlobalVariable(
      M, EntryTy, /*isConstant*/ true, GlobalValue::ExternalLinkage,
      /*Initializer*/ nullptr, "__start_omp_offloading_entries");
  EntriesBegin->setVisibility(GlobalValue::HiddenVisibility);

  auto *EntriesEnd = new GlobalVariable(
      M, EntryTy, /*isConstant*/ true, GlobalValue::ExternalLinkage,
      /*Initializer*/ nullptr, "__stop_omp_offloading_entries");
  EntriesEnd->setVisibility(GlobalValue::HiddenVisibility);

  // The begin/end symbols above are expected to be defined by the linker, but
  // the linker only does so when some input contains a section named
  // "omp_offloading_entries", which is not guaranteed. Emit a zero-sized dummy
  // object into that section so the symbols always get defined.
  auto *EmptyEntries = ConstantAggregateZero::get(ArrayType::get(EntryTy, 0u));
  auto *Dummy = new GlobalVariable(
      M, EmptyEntries->getType(), /*isConstant*/ true,
      GlobalVariable::ExternalLinkage, EmptyEntries,
      "__dummy.omp_offloading.entry");
  Dummy->setSection("omp_offloading_entries");
  Dummy->setVisibility(GlobalValue::HiddenVisibility);

  // Index list {0, 0}: address of the first element of an array global.
  auto *IdxZero = ConstantInt::get(SizeTy, 0u);
  Constant *FirstElt[] = {IdxZero, IdxZero};

  // Emit one byte-array global per buffer and collect the matching
  // __tgt_device_image initializers.
  StructType *ImageTy = getDeviceImageTy(M);
  SmallVector<Constant *, 4u> ImageDescs;
  ImageDescs.reserve(Bufs.size());
  for (ArrayRef<char> Contents : Bufs) {
    auto *Bytes = ConstantDataArray::get(Ctx, Contents);
    auto *ImageGV = new GlobalVariable(M, Bytes->getType(),
                                       /*isConstant*/ true,
                                       GlobalVariable::InternalLinkage, Bytes,
                                       ".omp_offloading.device_image");
    ImageGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);

    // Index list {0, size}: address one past the last byte of the image.
    auto *ByteCount = ConstantInt::get(SizeTy, Contents.size());
    Constant *PastLastElt[] = {IdxZero, ByteCount};

    auto *ImageStart = ConstantExpr::getGetElementPtr(ImageGV->getValueType(),
                                                      ImageGV, FirstElt);
    auto *ImageStop = ConstantExpr::getGetElementPtr(ImageGV->getValueType(),
                                                     ImageGV, PastLastElt);

    ImageDescs.push_back(ConstantStruct::get(ImageTy, ImageStart, ImageStop,
                                             EntriesBegin, EntriesEnd));
  }

  // Materialize the array of image descriptors.
  auto *ImageArrayInit = ConstantArray::get(
      ArrayType::get(ImageTy, ImageDescs.size()), ImageDescs);
  auto *ImageArrayGV = new GlobalVariable(
      M, ImageArrayInit->getType(), /*isConstant*/ true,
      GlobalValue::InternalLinkage, ImageArrayInit,
      ".omp_offloading.device_images");
  ImageArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);

  auto *ImageArrayStart = ConstantExpr::getGetElementPtr(
      ImageArrayGV->getValueType(), ImageArrayGV, FirstElt);

  // Finally assemble the descriptor that ties images and entries together.
  auto *NumImages = ConstantInt::get(Type::getInt32Ty(Ctx), ImageDescs.size());
  auto *DescInit = ConstantStruct::get(getBinDescTy(M), NumImages,
                                       ImageArrayStart, EntriesBegin,
                                       EntriesEnd);

  return new GlobalVariable(M, DescInit->getType(), /*isConstant*/ true,
                            GlobalValue::InternalLinkage, DescInit,
                            ".omp_offloading.descriptor");
}

/// Emits an internal function ".omp_offloading.descriptor_reg" into \p M that
/// calls __tgt_register_lib with \p BinDesc, and appends that function to the
/// module's global constructors.
void createRegisterFunction(Module &M, GlobalVariable *BinDesc) {
  LLVMContext &Ctx = M.getContext();
  Type *VoidTy = Type::getVoidTy(Ctx);

  // void .omp_offloading.descriptor_reg(void), placed in .text.startup.
  auto *CtorTy = FunctionType::get(VoidTy, /*isVarArg*/ false);
  auto *Ctor = Function::Create(CtorTy, GlobalValue::InternalLinkage,
                                ".omp_offloading.descriptor_reg", &M);
  Ctor->setSection(".text.startup");

  // Declaration of void __tgt_register_lib(__tgt_bin_desc *).
  PointerType *DescPtrTy = getBinDescPtrTy(M);
  auto *RuntimeFnTy = FunctionType::get(VoidTy, DescPtrTy,
                                        /*isVarArg*/ false);
  FunctionCallee RuntimeFn =
      M.getOrInsertFunction("__tgt_register_lib", RuntimeFnTy);

  // Body: call the runtime entry point with the descriptor, then return.
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Ctor);
  IRBuilder<> Builder(Entry);
  Builder.CreateCall(RuntimeFn, BinDesc);
  Builder.CreateRetVoid();

  // Register as a global constructor with priority 1 so that
  // __tgt_register_lib is executed AFTER __tgt_register_requires: the
  // requested requirements must be known before a libomptarget plugin is
  // loaded, so that once it is loaded the plugin can report how many devices
  // satisfy those requirements.
  appendToGlobalCtors(M, Ctor, /*Priority*/ 1);
}

/// Emits an internal function ".omp_offloading.descriptor_unreg" into \p M
/// that calls __tgt_unregister_lib with \p BinDesc, and appends that function
/// to the module's global destructors.
void createUnregisterFunction(Module &M, GlobalVariable *BinDesc) {
  LLVMContext &Ctx = M.getContext();
  Type *VoidTy = Type::getVoidTy(Ctx);

  // void .omp_offloading.descriptor_unreg(void), placed in .text.startup.
  auto *DtorTy = FunctionType::get(VoidTy, /*isVarArg*/ false);
  auto *Dtor = Function::Create(DtorTy, GlobalValue::InternalLinkage,
                                ".omp_offloading.descriptor_unreg", &M);
  Dtor->setSection(".text.startup");

  // Declaration of void __tgt_unregister_lib(__tgt_bin_desc *).
  PointerType *DescPtrTy = getBinDescPtrTy(M);
  auto *RuntimeFnTy = FunctionType::get(VoidTy, DescPtrTy,
                                        /*isVarArg*/ false);
  FunctionCallee RuntimeFn =
      M.getOrInsertFunction("__tgt_unregister_lib", RuntimeFnTy);

  // Body: call the runtime entry point with the descriptor, then return.
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Dtor);
  IRBuilder<> Builder(Entry);
  Builder.CreateCall(RuntimeFn, BinDesc);
  Builder.CreateRetVoid();

  // Register as a global destructor, using the same priority as the
  // constructor that calls __tgt_register_lib.
  appendToGlobalDtors(M, Dtor, /*Priority*/ 1);
}

} // namespace

/// Embeds the device images in \p Images into \p M and emits the constructor
/// and destructor that register and unregister them with the offloading
/// runtime. Returns an error if no binary descriptor was produced.
Error wrapBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
  if (GlobalVariable *Desc = createBinDesc(M, Images)) {
    createRegisterFunction(M, Desc);
    createUnregisterFunction(M, Desc);
    return Error::success();
  }
  return createStringError(inconvertibleErrorCode(),
                           "No binary descriptors created.");
}
20 changes: 20 additions & 0 deletions clang/tools/clang-linker-wrapper/OffloadWrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//===- OffloadWrapper.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLS_CLANG_LINKER_WRAPPER_OFFLOAD_WRAPPER_H
#define LLVM_CLANG_TOOLS_CLANG_LINKER_WRAPPER_OFFLOAD_WRAPPER_H

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Module.h"

/// Wraps the input device images into the module \p M as global symbols and
/// registers the images with the OpenMP Offloading runtime libomptarget.
llvm::Error wrapBinaries(llvm::Module &M,
llvm::ArrayRef<llvm::ArrayRef<char>> Images);

#endif
7 changes: 7 additions & 0 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1454,6 +1454,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
}

// Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
MPM.addPass(OpenMPOptPass());

// Remove unused virtual tables to improve the quality of code generated by
// whole-program devirtualization and bitset lowering.
MPM.addPass(GlobalDCEPass());
Expand Down Expand Up @@ -1648,6 +1651,10 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,

addVectorPasses(Level, MainFPM, /* IsFullLTO */ true);

// Run the OpenMPOpt CGSCC pass again late.
MPM.addPass(
createModuleToPostOrderCGSCCPassAdaptor(OpenMPOptCGSCCPass()));

invokePeepholeEPCallbacks(MainFPM, Level);
MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM),
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/new-pm-lto-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

; CHECK-O: Running pass: Annotation2Metadata
; CHECK-O-NEXT: Running pass: CrossDSOCFIPass
; CHECK-O-NEXT: Running pass: OpenMPOptPass
; CHECK-O-NEXT: Running pass: GlobalDCEPass
; CHECK-O-NEXT: Running pass: ForceFunctionAttrsPass
; CHECK-O-NEXT: Running pass: InferFunctionAttrsPass
Expand Down Expand Up @@ -87,6 +88,7 @@
; CHECK-O23SZ-NEXT: Running analysis: GlobalsAA on [module]
; CHECK-O23SZ-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}AAManager
; CHECK-O23SZ-NEXT: Invalidating analysis: AAManager on foo
; CHECK-O23SZ-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass on foo
; CHECK-O23SZ-NEXT: Running analysis: LoopAnalysis on foo
; CHECK-O23SZ-NEXT: Running pass: LCSSAPass on foo
Expand Down
2 changes: 2 additions & 0 deletions openmp/libomptarget/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@ endif()
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newRTL")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newDriver")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newRTL")
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newDriver")

# Once the plugins for the different targets are validated, they will be added to
# the list of supported targets in the current system.
Expand Down
2 changes: 1 addition & 1 deletion openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,6 @@ if (${amdgpu_arch_result})
libomptarget_say("Not generating amdgcn test targets as amdgpu-arch exited with ${amdgpu_arch_result}")
else()
# Report to the parent scope that we are building a plugin for amdgpu
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa-newRTL" PARENT_SCOPE)
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa-newRTL amdgcn-amd-amdhsa-newDriver " PARENT_SCOPE)
endif()

2 changes: 1 addition & 1 deletion openmp/libomptarget/plugins/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ target_link_libraries(omptarget.rtl.cuda
# Otherwise this plugin is being built speculatively and there may be no cuda available
if (LIBOMPTARGET_CAN_LINK_LIBCUDA OR LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
libomptarget_say("Enable tests using CUDA plugin")
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-newRTL" PARENT_SCOPE)
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-newRTL nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE)
else()
libomptarget_say("Disabling tests using CUDA plugin as cuda may not be available")
endif()
6 changes: 5 additions & 1 deletion openmp/libomptarget/test/lit.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,16 @@ else: # Unices
config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir
if config.libomptarget_current_target.endswith('-newRTL'):
config.test_flags += " -fopenmp-target-new-runtime"
else:
elif not config.libomptarget_current_target.endswith('-newDriver'):
config.test_flags += " -fno-openmp-target-new-runtime"
if config.libomptarget_current_target.endswith('-newDriver'):
config.test_flags += " -fopenmp-new-driver"

def remove_newRTL_suffix_if_present(name):
    """Return `name` without a trailing '-newRTL' or '-newDriver' suffix.

    At most one suffix is removed. A name that ends with neither suffix is
    returned unchanged.
    """
    # Take the slice length from the suffix itself rather than hard-coding it,
    # so the two cannot drift apart if a suffix is renamed or another is added.
    for suffix in ('-newRTL', '-newDriver'):
        if name.endswith(suffix):
            return name[:-len(suffix)]
    return name

Expand Down
29 changes: 29 additions & 0 deletions openmp/libomptarget/test/offloading/static_linking.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// RUN: %libomptarget-compile-generic -DLIBRARY -c -o %t.o
// RUN: llvm-ar rcs %t.a %t.o
// RUN: %libomptarget-compile-generic %t.a && %libomptarget-run-generic 2>&1 | %fcheck-generic

// Run when either new-driver GPU target is the one under test. lit combines
// separate requirement lines with a logical AND, so the two alternatives are
// joined with '||' on a single line.
// NOTE(review): assumes only the current target is registered as a feature;
// confirm against the feature setup in lit.cfg.
// REQUIRES: nvptx64-nvidia-cuda-newDriver || amdgcn-amd-amdhsa-newDriver

#ifdef LIBRARY
// Built with -DLIBRARY into the static archive: a device-visible global and a
// function that reads it back from inside a target region.
int x = 42;
#pragma omp declare target(x)

int foo() {
  int value;
#pragma omp target map(from : value)
  value = x;
  return value;
}
#else
// Built without -DLIBRARY and linked against the archive: prints the expected
// line only when foo() returns the value of the library's global.
#include <stdio.h>
int foo();

int main() {
  int x = foo();

  // CHECK: PASS
  if (x == 42)
    printf("PASS\n");
}
#endif
2 changes: 2 additions & 0 deletions openmp/libomptarget/test/unified_shared_memory/api.c
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
// RUN: %libomptarget-compile-run-and-check-generic
// XFAIL: nvptx64-nvidia-cuda
// XFAIL: nvptx64-nvidia-cuda-newRTL
// XFAIL: nvptx64-nvidia-cuda-newDriver

// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
// XFAIL: amdgcn-amd-amdhsa-newDriver

#include <stdio.h>
#include <omp.h>
Expand Down