diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 8ef6700ecdc78..64b87ecdc9752 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -260,6 +260,7 @@ LANGOPT(OpenMPTargetDebug , 32, 0, "Enable debugging in the OpenMP offloading de LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.") LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.") LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.") +LANGOPT(OpenMPGlobalizeToGlobalSpace , 1, 0, "Globalize to global space for the globalized variables") LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region") LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4b1fcf1db1ad0..dd742ead7c0f2 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3497,6 +3497,10 @@ def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested HelpText<"Assert no nested parallel regions in the GPU">, MarshallingInfoFlag>; +def fopenmp_globalize_to_global_space : Flag<["-"], "fopenmp-globalize-to-global-space">, + HelpText<"Globalize to global space for the globalized variables">, + MarshallingInfoFlag>; + } // let Group = f_Group } // let Visibility = [ClangOption, CC1Option, FC1Option] } // let Flags = [NoArgumentUnused, HelpHidden] diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h index bf5ab171d720d..fe5beff05134a 100644 --- 
a/clang/lib/CodeGen/CGBuilder.h +++ b/clang/lib/CodeGen/CGBuilder.h @@ -152,6 +152,15 @@ class CGBuilderTy : public CGBuilderBaseTy { Addr.isKnownNonNull()); } + /// Cast the element type of the given address to a different type, + /// preserving information like the alignment and address space. + Address CreateElementBitCast(Address Addr, llvm::Type *Ty, + const llvm::Twine &Name = "") { + auto *PtrTy = Ty->getPointerTo(Addr.getAddressSpace()); + return Address(CreateBitCast(Addr.getPointer(), PtrTy, Name), Ty, + Addr.getAlignment(), Addr.isKnownNonNull()); + } + using CGBuilderBaseTy::CreatePointerBitCastOrAddrSpaceCast; Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index dc42faf8dbb9f..691af33dc239d 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -2531,48 +2531,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg, (IPD->getParameterKind() == ImplicitParamKind::ThreadPrivateVar); } - Address DeclPtr = Address::invalid(); - Address AllocaPtr = Address::invalid(); - bool DoStore = false; - bool IsScalar = hasScalarEvaluationKind(Ty); - bool UseIndirectDebugAddress = false; - - // If we already have a pointer to the argument, reuse the input pointer. - if (Arg.isIndirect()) { - DeclPtr = Arg.getIndirectAddress(); - DeclPtr = DeclPtr.withElementType(ConvertTypeForMem(Ty)); - // Indirect argument is in alloca address space, which may be different - // from the default address space. - auto AllocaAS = CGM.getASTAllocaAddressSpace(); - auto *V = DeclPtr.getPointer(); - AllocaPtr = DeclPtr; - - // For truly ABI indirect arguments -- those that are not `byval` -- store - // the address of the argument on the stack to preserve debug information. 
- ABIArgInfo ArgInfo = CurFnInfo->arguments()[ArgNo - 1].info; - if (ArgInfo.isIndirect()) - UseIndirectDebugAddress = !ArgInfo.getIndirectByVal(); - if (UseIndirectDebugAddress) { - auto PtrTy = getContext().getPointerType(Ty); - AllocaPtr = CreateMemTemp(PtrTy, getContext().getTypeAlignInChars(PtrTy), - D.getName() + ".indirect_addr"); - EmitStoreOfScalar(V, AllocaPtr, /* Volatile */ false, PtrTy); - } - - auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS; - auto DestLangAS = - getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default; - if (SrcLangAS != DestLangAS) { - assert(getContext().getTargetAddressSpace(SrcLangAS) == - CGM.getDataLayout().getAllocaAddrSpace()); - auto DestAS = getContext().getTargetAddressSpace(DestLangAS); - auto *T = llvm::PointerType::get(getLLVMContext(), DestAS); - DeclPtr = - DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast( - *this, V, SrcLangAS, DestLangAS, T, true), - DeclPtr.isKnownNonNull()); - } - + auto PushCleanupIfNeeded = [this, Ty, &D](Address DeclPtr) { // Push a destructor cleanup for this parameter if the ABI requires it. // Don't push a cleanup in a thunk for a method that will also emit a // cleanup. @@ -2588,87 +2547,126 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg, EHStack.stable_begin(); } } + }; + + Address DeclPtr = Address::invalid(); + Address AllocaPtr = Address::invalid(); + Address OpenMPLocalAddr = + getLangOpts().OpenMP + ? 
CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D) + : Address::invalid(); + bool DoStore = false; + bool IsScalar = hasScalarEvaluationKind(Ty); + bool UseIndirectDebugAddress = false; + if (OpenMPLocalAddr.isValid()) { + DeclPtr = OpenMPLocalAddr; + AllocaPtr = DeclPtr; + LValue Dst = MakeAddrLValue(DeclPtr, Ty); + if (Arg.isIndirect()) { + LValue Src = MakeAddrLValue(Arg.getIndirectAddress(), Ty); + callCStructCopyConstructor(Dst, Src); + PushCleanupIfNeeded(Arg.getIndirectAddress()); + } else { + EmitStoreOfScalar(Arg.getDirectValue(), Dst, /* isInitialization */ true); + } } else { - // Check if the parameter address is controlled by OpenMP runtime. - Address OpenMPLocalAddr = - getLangOpts().OpenMP - ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D) - : Address::invalid(); - if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) { - DeclPtr = OpenMPLocalAddr; + // If we already have a pointer to the argument, reuse the input pointer. + if (Arg.isIndirect()) { + // If we have a prettier pointer type at this point, bitcast to that. + DeclPtr = Arg.getIndirectAddress(); + DeclPtr = Builder.CreateElementBitCast(DeclPtr, ConvertTypeForMem(Ty), + D.getName()); + // Indirect argument is in alloca address space, which may be different + // from the default address space. + auto AllocaAS = CGM.getASTAllocaAddressSpace(); + auto *V = DeclPtr.getPointer(); AllocaPtr = DeclPtr; + auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS; + auto DestLangAS = + getLangOpts().OpenCL ? 
LangAS::opencl_private : LangAS::Default; + if (SrcLangAS != DestLangAS) { + assert(getContext().getTargetAddressSpace(SrcLangAS) == + CGM.getDataLayout().getAllocaAddrSpace()); + auto DestAS = getContext().getTargetAddressSpace(DestLangAS); + auto *T = DeclPtr.getElementType()->getPointerTo(DestAS); + DeclPtr = + DeclPtr.withPointer(getTargetHooks().performAddrSpaceCast( + *this, V, SrcLangAS, DestLangAS, T, true), + DeclPtr.isKnownNonNull()); + } + PushCleanupIfNeeded(DeclPtr); } else { - // Otherwise, create a temporary to hold the value. + // Create a temporary to hold the value. DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D), D.getName() + ".addr", &AllocaPtr); + DoStore = true; } - DoStore = true; - } - - llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr); - - LValue lv = MakeAddrLValue(DeclPtr, Ty); - if (IsScalar) { - Qualifiers qs = Ty.getQualifiers(); - if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) { - // We honor __attribute__((ns_consumed)) for types with lifetime. - // For __strong, it's handled by just skipping the initial retain; - // otherwise we have to balance out the initial +1 with an extra - // cleanup to do the release at the end of the function. - bool isConsumed = D.hasAttr(); - - // If a parameter is pseudo-strong then we can omit the implicit retain. - if (D.isARCPseudoStrong()) { - assert(lt == Qualifiers::OCL_Strong && - "pseudo-strong variable isn't strong?"); - assert(qs.hasConst() && "pseudo-strong variable should be const!"); - lt = Qualifiers::OCL_ExplicitNone; - } - // Load objects passed indirectly. - if (Arg.isIndirect() && !ArgVal) - ArgVal = Builder.CreateLoad(DeclPtr); - - if (lt == Qualifiers::OCL_Strong) { - if (!isConsumed) { - if (CGM.getCodeGenOpts().OptimizationLevel == 0) { - // use objc_storeStrong(&dest, value) for retaining the - // object. But first, store a null into 'dest' because - // objc_storeStrong attempts to release its old value. 
- llvm::Value *Null = CGM.EmitNullConstant(D.getType()); - EmitStoreOfScalar(Null, lv, /* isInitialization */ true); - EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true); - DoStore = false; - } - else - // Don't use objc_retainBlock for block pointers, because we - // don't want to Block_copy something just because we got it - // as a parameter. - ArgVal = EmitARCRetainNonBlock(ArgVal); - } - } else { - // Push the cleanup for a consumed parameter. - if (isConsumed) { - ARCPreciseLifetime_t precise = (D.hasAttr() - ? ARCPreciseLifetime : ARCImpreciseLifetime); - EHStack.pushCleanup(getARCCleanupKind(), ArgVal, - precise); + llvm::Value *ArgVal = (DoStore ? Arg.getDirectValue() : nullptr); + + LValue lv = MakeAddrLValue(DeclPtr, Ty); + if (IsScalar) { + Qualifiers qs = Ty.getQualifiers(); + if (Qualifiers::ObjCLifetime lt = qs.getObjCLifetime()) { + // We honor __attribute__((ns_consumed)) for types with lifetime. + // For __strong, it's handled by just skipping the initial retain; + // otherwise we have to balance out the initial +1 with an extra + // cleanup to do the release at the end of the function. + bool isConsumed = D.hasAttr(); + + // If a parameter is pseudo-strong then we can omit the implicit retain. + if (D.isARCPseudoStrong()) { + assert(lt == Qualifiers::OCL_Strong && + "pseudo-strong variable isn't strong?"); + assert(qs.hasConst() && "pseudo-strong variable should be const!"); + lt = Qualifiers::OCL_ExplicitNone; } - if (lt == Qualifiers::OCL_Weak) { - EmitARCInitWeak(DeclPtr, ArgVal); - DoStore = false; // The weak init is a store, no need to do two. + // Load objects passed indirectly. + if (Arg.isIndirect() && !ArgVal) + ArgVal = Builder.CreateLoad(DeclPtr); + + if (lt == Qualifiers::OCL_Strong) { + if (!isConsumed) { + if (CGM.getCodeGenOpts().OptimizationLevel == 0) { + // use objc_storeStrong(&dest, value) for retaining the + // object. 
But first, store a null into 'dest' because + // objc_storeStrong attempts to release its old value. + llvm::Value *Null = CGM.EmitNullConstant(D.getType()); + EmitStoreOfScalar(Null, lv, /* isInitialization */ true); + EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true); + DoStore = false; + } else + // Don't use objc_retainBlock for block pointers, because we + // don't want to Block_copy something just because we got it + // as a parameter. + ArgVal = EmitARCRetainNonBlock(ArgVal); + } + } else { + // Push the cleanup for a consumed parameter. + if (isConsumed) { + ARCPreciseLifetime_t precise = + (D.hasAttr() ? ARCPreciseLifetime + : ARCImpreciseLifetime); + EHStack.pushCleanup(getARCCleanupKind(), + ArgVal, precise); + } + + if (lt == Qualifiers::OCL_Weak) { + EmitARCInitWeak(DeclPtr, ArgVal); + DoStore = false; // The weak init is a store, no need to do two. + } } - } - // Enter the cleanup scope. - EmitAutoVarWithLifetime(*this, D, DeclPtr, lt); + // Enter the cleanup scope. + EmitAutoVarWithLifetime(*this, D, DeclPtr, lt); + } } - } - // Store the initial value into the alloca. - if (DoStore) - EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true); + // Store the initial value into the alloca. 
+ if (DoStore) + EmitStoreOfScalar(ArgVal, lv, /* isInitialization */ true); + } setAddrOfLocalVar(&D, DeclPtr); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 299ee1460b3db..8f0c7caa2f3b4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1083,10 +1083,12 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, // Allocate space for the variable to be globalized llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; - llvm::CallBase *VoidPtr = - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_alloc_shared), - AllocArgs, VD->getName()); + llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace + ? OMPRTL_malloc + : OMPRTL___kmpc_alloc_shared), + AllocArgs, VD->getName()); // FIXME: We should use the variables actual alignment as an argument. VoidPtr->addRetAttr(llvm::Attribute::get( CGM.getLLVMContext(), llvm::Attribute::Alignment, @@ -1149,10 +1151,12 @@ CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF, // Allocate space for this VLA object to be globalized. llvm::Value *AllocArgs[] = {Size}; - llvm::CallBase *VoidPtr = - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_alloc_shared), - AllocArgs, VD->getName()); + llvm::CallBase *VoidPtr = CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace + ? OMPRTL_malloc + : OMPRTL___kmpc_alloc_shared), + AllocArgs, VD->getName()); VoidPtr->addRetAttr(llvm::Attribute::get( CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity())); @@ -1178,20 +1182,29 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) { // globalized in the prolog (i.e. emitGenericVarsProlog). 
for (const auto &AddrSizePair : llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_free_shared), - {AddrSizePair.first, AddrSizePair.second}); + if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace) + CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free), + {AddrSizePair.first}); + else + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_free_shared), + {AddrSizePair.first, AddrSizePair.second}); } // Deallocate the memory for each globalized value for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { const auto *VD = cast(Rec.first); I->getSecond().MappedParams->restore(CGF); - llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal, - CGF.getTypeSize(VD->getType())}; - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_free_shared), - FreeArgs); + if (CGM.getLangOpts().OpenMPGlobalizeToGlobalSpace) + CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), OMPRTL_free), + {Rec.second.GlobalizedVal}); + else + CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), + OMPRTL___kmpc_free_shared), + {Rec.second.GlobalizedVal, CGF.getTypeSize(VD->getType())}); } } } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 055884d275ce1..92aadc8fd4ce6 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6519,6 +6519,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fopenmp-offload-mandatory"); if (Args.hasArg(options::OPT_fopenmp_force_usm)) CmdArgs.push_back("-fopenmp-force-usm"); + if (Args.hasArg(options::OPT_fopenmp_globalize_to_global_space)) + CmdArgs.push_back("-fopenmp-globalize-to-global-space"); break; default: // By default, if Clang doesn't know how to generate useful 
OpenMP code diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index c60be2789bd61..32a051799a6e3 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -137,6 +137,13 @@ static constexpr OptTable::Info InfoTable[] = { #undef OPTION }; +/// Host RPC module that will be shared to the corresponding pass. +Module *HostModule = nullptr; +/// We only need to generate the host RPC module once. +bool IsHostModuleGenerated = false; +/// Host RPC object file. +StringRef HostRPCObjFile; + class WrapperOptTable : public opt::GenericOptTable { public: WrapperOptTable() : opt::GenericOptTable(InfoTable) {} @@ -614,10 +621,12 @@ std::vector getTargetFeatures(ArrayRef InputFiles) { return UnifiedFeatures; } -template > +template , + typename PostHookTy = function_ref> std::unique_ptr createLTO( const ArgList &Args, const std::vector &Features, - ModuleHook Hook = [](size_t, const Module &) { return true; }) { + PreHookTy PreHook = [](size_t, const Module &) { return true; }, + PostHookTy PostHook = [](size_t, const Module &) { return true; }) { const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); // We need to remove AMD's target-id from the processor if present. StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first; @@ -672,10 +681,10 @@ std::unique_ptr createLTO( return true; }; } - Conf.PostOptModuleHook = Hook; - Conf.CGFileType = (Triple.isNVPTX() || SaveTemps) - ? CodeGenFileType::AssemblyFile - : CodeGenFileType::ObjectFile; + + Conf.PreOptModuleHook = PreHook; + Conf.PostOptModuleHook = PostHook; + Conf.CGFileType = Triple.isNVPTX() ? 
CodeGenFileType::AssemblyFile : CodeGenFileType::ObjectFile; // TODO: Handle remark files Conf.HasWholeProgramVisibility = Args.hasArg(OPT_whole_program); @@ -691,6 +700,58 @@ bool isValidCIdentifier(StringRef S) { [](char C) { return C == '_' || isAlnum(C); }); } +bool writeHostModule(std::string &FileName) { + if (!HostModule) + return false; + if (HostModule->getFunctionList().empty()) + return false; + + auto HostTriple = HostModule->getTargetTriple(); + FileName = + sys::path::filename(ExecutableName).str() + "-host-rpc-" + HostTriple; + auto TempFileOrErr = createOutputFile(FileName, "bc"); + if (!TempFileOrErr) + reportError(TempFileOrErr.takeError()); + int FD = -1; + if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD)) + reportError(errorCodeToError(EC)); + + auto Out = std::make_unique(FD, true); + WriteBitcodeToFile(*HostModule, *Out); + + return true; +} + +std::unique_ptr createHostRPCLTO(StringRef HostTriple) { + const llvm::Triple Triple(HostTriple); + lto::Config Conf; + lto::ThinBackend Backend; + Backend = + lto::createInProcessThinBackend(llvm::heavyweight_hardware_concurrency()); + + // TODO: host arch? + // Conf.CPU = Arch.str(); + Conf.Options = codegen::InitTargetOptionsFromCodeGenFlags(Triple); + + // TODO: host features? 
+ // Conf.MAttrs = Features; + Conf.CGOptLevel = *CodeGenOpt::getLevel(3); + Conf.OptLevel = 3; + Conf.UseDefaultPipeline = true; + Conf.DefaultTriple = Triple.getTriple(); + + LTOError = false; + Conf.DiagHandler = diagnosticHandler; + + Conf.PTO.LoopVectorization = Conf.OptLevel > 1; + Conf.PTO.SLPVectorization = Conf.OptLevel > 1; + Conf.CGFileType = CodeGenFileType::ObjectFile; + + Conf.HasWholeProgramVisibility = false; + + return std::make_unique(std::move(Conf), Backend); +} + Error linkBitcodeFiles(SmallVectorImpl &InputFiles, SmallVectorImpl &OutputFiles, const ArgList &Args) { @@ -776,14 +837,51 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, BitcodeOutput.push_back(*TempFileOrErr); return false; }; + auto AddHostModuleAddr = [&](size_t, const Module &M) { + if (!HostModule) + return true; + + Module &CM = const_cast(M); + auto *MD = CM.getOrInsertNamedMetadata("llvm.hostrpc.hostmodule"); + MD->clearOperands(); + MD->addOperand(MDTuple::get( + CM.getContext(), {ConstantAsMetadata::get(ConstantInt::get( + Type::getInt64Ty(CM.getContext()), + reinterpret_cast(HostModule)))})); + return true; + }; // We assume visibility of the whole program if every input file was bitcode. auto Features = getTargetFeatures(BitcodeInputFiles); - auto LTOBackend = Args.hasArg(OPT_embed_bitcode) || - Args.hasArg(OPT_builtin_bitcode_EQ) || - Args.hasArg(OPT_clang_backend) - ? createLTO(Args, Features, OutputBitcode) - : createLTO(Args, Features); + auto LTOBackend = + Args.hasArg(OPT_embed_bitcode) + ? 
createLTO(Args, Features, AddHostModuleAddr, OutputBitcode) + : createLTO(Args, Features, AddHostModuleAddr); + + LLVMContext &Ctx = LTOBackend->getContext(); + StringRef HostTriple = + Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple()); + std::unique_ptr HostModulePtr; + if (!IsHostModuleGenerated) { + HostModulePtr = std::make_unique( + sys::path::filename(ExecutableName).str() + "-host-rpc.bc", Ctx); + HostModule = HostModulePtr.get(); + HostModulePtr->setTargetTriple(HostTriple); + + std::string Msg; + const Target *T = + TargetRegistry::lookupTarget(HostModule->getTargetTriple(), Msg); + if (!T) + return createStringError(inconvertibleErrorCode(), Msg); + auto Options = + codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple(HostTriple)); + StringRef CPU = ""; + StringRef Features = ""; + std::unique_ptr TM( + T->createTargetMachine(HostTriple, CPU, Features, Options, Reloc::PIC_, + HostModule->getCodeModel())); + HostModule->setDataLayout(TM->createDataLayout()); + } // We need to resolve the symbols so the LTO backend knows which symbols need // to be kept or can be internalized. This is a simplified symbol resolution @@ -877,6 +975,57 @@ Error linkBitcodeFiles(SmallVectorImpl &InputFiles, if (Error Err = LTOBackend->run(AddStream)) return Err; + std::string HostModuleTempFile; + bool ValidHostModule = writeHostModule(HostModuleTempFile); + // Reset the HostModule pointer. + HostModulePtr.reset(); + HostModule = nullptr; + // TODO: this is really redundant code. 
+ if (ValidHostModule) { + auto HostLTO = createHostRPCLTO(HostTriple); + + std::string HostBitCodeFile = HostModuleTempFile + ".bc"; + auto BufferOrError = MemoryBuffer::getFile(HostBitCodeFile); + if (!BufferOrError) + reportError(createFileError(HostBitCodeFile, BufferOrError.getError())); + Expected> BitcodeFileOrErr = + llvm::lto::InputFile::create(*BufferOrError.get()); + if (!BitcodeFileOrErr) + return BitcodeFileOrErr.takeError(); + + const auto Symbols = (*BitcodeFileOrErr)->symbols(); + SmallVector Resolutions(Symbols.size()); + size_t Idx = 0; + for (auto &Sym : Symbols) { + (void)Sym; + lto::SymbolResolution &Res = Resolutions[Idx++]; + Res.ExportDynamic = true; + Res.VisibleToRegularObj = true; + Res.LinkerRedefined = false; + Res.Prevailing = true; + } + if (Error Err = HostLTO->add(std::move(*BitcodeFileOrErr), Resolutions)) + return Err; + + auto RPCAddStream = + [&](size_t Task, + const Twine &ModuleName) -> std::unique_ptr { + int FD = -1; + auto TempFileOrErr = createOutputFile( + sys::path::filename(ExecutableName) + "-host-rpc-" + HostTriple, "o"); + if (!TempFileOrErr) + reportError(TempFileOrErr.takeError()); + HostRPCObjFile = *TempFileOrErr; + if (std::error_code EC = sys::fs::openFileForWrite(*TempFileOrErr, FD)) + reportError(errorCodeToError(EC)); + return std::make_unique( + std::make_unique(FD, true)); + }; + + if (Error Err = HostLTO->run(RPCAddStream)) + return Err; + } + if (LTOError) return createStringError(inconvertibleErrorCode(), "Errors encountered inside the LTO pipeline."); @@ -1245,6 +1394,9 @@ Expected> linkAndWrapDeviceFiles( WrappedOutput.push_back(*OutputOrErr); } + if (!HostRPCObjFile.empty()) + WrappedOutput.push_back(HostRPCObjFile); + return WrappedOutput; } diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h index ccf8e727c4045..811ca72b576e6 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h +++ 
b/llvm/include/llvm/Frontend/OpenMP/OMPDeviceConstants.h @@ -25,6 +25,19 @@ enum OMPTgtExecModeFlags : unsigned char { OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD }; +enum OMPTgtHostRPCArgType { + // No need to copy. + OMP_HOST_RPC_ARG_SCALAR = 0, + OMP_HOST_RPC_ARG_PTR = 1, + // Copy to device. + OMP_HOST_RPC_ARG_COPY_TO = OMP_HOST_RPC_ARG_PTR | (1 << 1), + // Copy from device. + OMP_HOST_RPC_ARG_COPY_FROM = OMP_HOST_RPC_ARG_PTR | (1 << 2), + // Copy to and from device. + OMP_HOST_RPC_ARG_COPY_TOFROM = + OMP_HOST_RPC_ARG_COPY_TO | OMP_HOST_RPC_ARG_COPY_FROM, +}; + } // end namespace omp } // end namespace llvm diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index d22d2a8e948b0..90d5d19739674 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -227,7 +227,9 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, ) __OMP_RTL(__kmpc_get_warp_size, false, Int32, ) __OMP_RTL(omp_get_thread_num, false, Int32, ) +__OMP_RTL(omp_get_bulk_thread_num, false, Int32, ) __OMP_RTL(omp_get_num_threads, false, Int32, ) +__OMP_RTL(omp_get_bulk_num_threads, false, Int32, ) __OMP_RTL(omp_get_max_threads, false, Int32, ) __OMP_RTL(omp_in_parallel, false, Int32, ) __OMP_RTL(omp_get_dynamic, false, Int32, ) @@ -490,6 +492,8 @@ __OMP_RTL(__kmpc_reduction_get_fixed_buffer, false, VoidPtr, ) __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16) +__OMP_RTL(malloc, false, VoidPtr, SizeTy) +__OMP_RTL(free, false, Void, VoidPtr) __OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy) __OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr, SizeTy) __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy) @@ -503,6 +507,9 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,) __OMP_RTL(__kmpc_syncwarp, false, Void, Int64) 
+__OMP_RTL(__kmpc_launch_parallel_51_kernel, false, Void, Int8Ptr, Int32, Int32, + Int32, VoidPtrPtr, Int64) + __OMP_RTL(__last, false, Void, ) #undef __OMP_RTL @@ -710,6 +717,8 @@ __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs()) __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs()) __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs()) +__OMP_RTL_ATTRS(omp_get_bulk_thread_num, GetterAttrs, SExt, ParamAttrs()) +__OMP_RTL_ATTRS(omp_get_bulk_num_threads, GetterAttrs, SExt, ParamAttrs()) __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs()) __OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs()) __OMP_RTL_ATTRS(omp_get_dynamic, GetterAttrs, SExt, ParamAttrs()) diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h index 482b6e55a19d3..fe70dce082e1d 100644 --- a/llvm/include/llvm/LTO/Config.h +++ b/llvm/include/llvm/LTO/Config.h @@ -60,6 +60,9 @@ struct Config { bool VerifyEach = false; bool DisableVerify = false; + /// Use the standard optimization pipeline. + bool UseDefaultPipeline = false; + /// Flag to indicate that the optimizer should not assume builtins are present /// on the target. bool Freestanding = false; diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 94996ae89e35d..431ef3a09e07a 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -303,6 +303,9 @@ class LTO { /// by LTO but might not be visible from bitcode symbol table. static ArrayRef getRuntimeLibcallSymbols(); + /// Returns the context. + LLVMContext &getContext() { return RegularLTO.Ctx; } + private: Config Conf;