69 changes: 55 additions & 14 deletions llvm/include/llvm/Support/Threading.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#ifndef LLVM_SUPPORT_THREADING_H
#define LLVM_SUPPORT_THREADING_H

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/FunctionExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
Expand Down Expand Up @@ -143,20 +144,52 @@ void llvm_execute_on_thread_async(
#endif
}

/// Get the amount of concurrency to use for tasks requiring significant
/// memory or other resources. Currently based on physical cores, if
/// available for the host system, otherwise falls back to
/// thread::hardware_concurrency().
/// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF
unsigned heavyweight_hardware_concurrency();

/// Get the number of threads that the current program can execute
/// concurrently. On some systems std::thread::hardware_concurrency() returns
/// the total number of cores, without taking affinity into consideration.
/// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF.
/// Fallback to std::thread::hardware_concurrency() if sched_getaffinity is
/// not available.
unsigned hardware_concurrency();
/// This tells how a thread pool will be used
class ThreadPoolStrategy {
public:
/// Number of threads requested by the client. The default value (0) means
/// all available threads should be used, excluding those excluded by the
/// affinity mask. If non-zero, the value is only a suggested upper bound;
/// the runtime may choose a lower thread count (never a higher one).
unsigned ThreadsRequested = 0;

/// Whether to use hyper-threads when SMT is active. If false, at most one
/// std::thread is created per physical core.
bool UseHyperThreads = true;

/// Retrieves the max available threads for the current strategy. This
/// accounts for affinity masks and takes advantage of all CPU sockets.
unsigned compute_thread_count() const;

/// Assign the current thread to an ideal hardware CPU or NUMA node. In a
/// multi-socket system, this ensures threads are assigned to all CPU
/// sockets. \p ThreadPoolNum represents a number bounded by [0,
/// compute_thread_count()).
void apply_thread_strategy(unsigned ThreadPoolNum) const;
};

/// Returns a thread strategy for tasks requiring significant memory or other
/// resources. To be used for workloads where hardware_concurrency() proves to
/// be less efficient. Avoid this strategy if doing lots of I/O. Currently
/// based on physical cores, if available for the host system, otherwise falls
/// back to hardware_concurrency(). Returns 1 when LLVM is configured with
/// LLVM_ENABLE_THREADS=OFF.
inline ThreadPoolStrategy
heavyweight_hardware_concurrency(unsigned ThreadCount = 0) {
  ThreadPoolStrategy Strategy;
  Strategy.ThreadsRequested = ThreadCount;
  Strategy.UseHyperThreads = false;
  return Strategy;
}

/// Returns a default thread strategy where all available hardware resources
/// are to be used, except for those initially excluded by an affinity mask.
/// This function takes affinity into consideration. Returns 1 when LLVM is
/// configured with LLVM_ENABLE_THREADS=OFF.
inline ThreadPoolStrategy hardware_concurrency(unsigned ThreadCount = 0) {
  ThreadPoolStrategy Strategy;
  Strategy.ThreadsRequested = ThreadCount;
  return Strategy;
}

/// Return the current thread id, as used in various OS system calls.
/// Note that not all platforms guarantee that the value returned will be
Expand Down Expand Up @@ -184,6 +217,14 @@ void llvm_execute_on_thread_async(
/// the operation succeeded or failed is returned.
void get_thread_name(SmallVectorImpl<char> &Name);

/// Returns a mask that represents on which hardware thread, core, CPU, NUMA
/// group, the calling thread can be executed. On Windows, threads cannot
/// cross CPU boundaries.
llvm::BitVector get_thread_affinity_mask();

/// Returns how many physical CPUs or NUMA groups the system has.
unsigned get_cpus();

enum class ThreadPriority {
Background = 0,
Default = 1,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/ParallelCG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ std::unique_ptr<Module> llvm::splitCodeGen(
// Create ThreadPool in nested scope so that threads will be joined
// on destruction.
{
ThreadPool CodegenThreadPool(OSs.size());
ThreadPool CodegenThreadPool(hardware_concurrency(OSs.size()));
int ThreadCount = 0;

SplitModule(
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/DWARFLinker/DWARFLinker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2446,7 +2446,7 @@ bool DWARFLinker::link() {
}
EmitLambda();
} else {
ThreadPool Pool(2);
ThreadPool Pool(hardware_concurrency(2));
Pool.async(AnalyzeAll);
Pool.async(CloneAll);
Pool.wait();
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ Error DwarfTransformer::convert(uint32_t NumThreads) {

// Now parse all DIEs in case we have cross compile unit references in a
// thread pool.
ThreadPool pool(NumThreads);
ThreadPool pool(hardware_concurrency(NumThreads));
for (const auto &CU : DICtx.compile_units())
pool.async([&CU]() { CU->getUnitDIE(false /*CUDieOnly*/); });
pool.wait();
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ LLJIT::LLJIT(LLJITBuilderState &S, Error &Err)

if (S.NumCompileThreads > 0) {
TransformLayer->setCloneToNewContextOnEmit(true);
CompileThreads = std::make_unique<ThreadPool>(S.NumCompileThreads);
CompileThreads =
std::make_unique<ThreadPool>(hardware_concurrency(S.NumCompileThreads));
ES->setDispatchMaterialization(
[this](JITDylib &JD, std::unique_ptr<MaterializationUnit> MU) {
// FIXME: Switch to move capture once we have c++14.
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/LTO/LTO.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -477,8 +477,7 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
: Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
if (!Backend)
this->Backend =
createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
this->Backend = createInProcessThinBackend();
}

LTO::LTO(Config Conf, ThinBackend Backend,
Expand Down Expand Up @@ -1095,7 +1094,8 @@ class InProcessThinBackend : public ThinBackendProc {
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
BackendThreadPool(ThinLTOParallelismLevel),
BackendThreadPool(
heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
for (auto &Name : CombinedIndex.cfiFunctionDefs())
CfiFunctionDefs.insert(
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/LTO/LTOBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,8 @@ void codegen(const Config &Conf, TargetMachine *TM, AddStreamFn AddStream,
void splitCodeGen(const Config &C, TargetMachine *TM, AddStreamFn AddStream,
unsigned ParallelCodeGenParallelismLevel,
std::unique_ptr<Module> Mod) {
ThreadPool CodegenThreadPool(ParallelCodeGenParallelismLevel);
ThreadPool CodegenThreadPool(
heavyweight_hardware_concurrency(ParallelCodeGenParallelismLevel));
unsigned ThreadCount = 0;
const Target *T = &TM->getTarget();

Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/LTO/ThinLTOCodeGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ extern cl::opt<std::string> RemarksFormat;

namespace {

static cl::opt<int>
ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency()));
// Default to using one job per hardware core in the system
static cl::opt<int> ThreadCount("threads", cl::init(0));

// Simple helper to save temporary files for debug.
static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
Expand Down Expand Up @@ -1042,7 +1042,7 @@ void ThinLTOCodeGenerator::run() {

// Parallel optimizer + codegen
{
ThreadPool Pool(ThreadCount);
ThreadPool Pool(heavyweight_hardware_concurrency(ThreadCount));
for (auto IndexCount : ModulesOrdering) {
auto &Mod = Modules[IndexCount];
Pool.async([&](int count) {
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Support/Host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1266,7 +1266,7 @@ StringRef sys::getHostCPUName() { return "generic"; }
// On Linux, the number of physical cores can be computed from /proc/cpuinfo,
// using the number of unique physical/core id pairs. The following
// implementation reads the /proc/cpuinfo format on an x86_64 system.
static int computeHostNumPhysicalCores() {
int computeHostNumPhysicalCores() {
// Read /proc/cpuinfo as a stream (until EOF reached). It cannot be
// mmapped because it appears to have 0 size.
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
Expand Down Expand Up @@ -1312,7 +1312,7 @@ static int computeHostNumPhysicalCores() {
#include <sys/sysctl.h>

// Gets the number of *physical cores* on the machine.
static int computeHostNumPhysicalCores() {
int computeHostNumPhysicalCores() {
uint32_t count;
size_t len = sizeof(count);
sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0);
Expand All @@ -1326,6 +1326,9 @@ static int computeHostNumPhysicalCores() {
}
return count;
}
#elif defined(_WIN32)
// Defined in llvm/lib/Support/Windows/Threading.inc
int computeHostNumPhysicalCores();
#else
// On other systems, return -1 to indicate unknown.
static int computeHostNumPhysicalCores() { return -1; }
Expand Down
14 changes: 8 additions & 6 deletions llvm/lib/Support/Parallel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,21 @@ class Executor {
/// in filo order.
class ThreadPoolExecutor : public Executor {
public:
explicit ThreadPoolExecutor(unsigned ThreadCount = hardware_concurrency()) {
// Builds the executor for the given strategy. Thread 0 is created here; it
// then spawns the remaining workers itself, because creating many threads
// from the constructor can take a while.
explicit ThreadPoolExecutor(ThreadPoolStrategy S = hardware_concurrency()) {
unsigned ThreadCount = S.compute_thread_count();
// Spawn all but one of the threads in another thread as spawning threads
// can take a while.
// reserve() up front so the emplace_back calls below never reallocate the
// vector while it is being filled.
Threads.reserve(ThreadCount);
Threads.resize(1);
std::lock_guard<std::mutex> Lock(Mutex);
Threads[0] = std::thread([this, ThreadCount, S] {
// Spawn workers 1..ThreadCount-1; each runs work() with its own ID so the
// strategy can place it on a suitable CPU (see work()).
for (unsigned I = 1; I < ThreadCount; ++I) {
Threads.emplace_back([=] { work(S, I); });
if (Stop)
break;
}
// Signal that every std::thread object now exists, then join the worker
// pool ourselves as worker 0.
ThreadsCreated.set_value();
work(S, 0);
});
}

Expand Down Expand Up @@ -90,7 +91,8 @@ class ThreadPoolExecutor : public Executor {
}

private:
void work() {
void work(ThreadPoolStrategy S, unsigned ThreadID) {
S.apply_thread_strategy(ThreadID);
while (true) {
std::unique_lock<std::mutex> Lock(Mutex);
Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
Expand Down
23 changes: 9 additions & 14 deletions llvm/lib/Support/ThreadPool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,15 @@ using namespace llvm;

#if LLVM_ENABLE_THREADS

// Default to hardware_concurrency
ThreadPool::ThreadPool() : ThreadPool(hardware_concurrency()) {}

ThreadPool::ThreadPool(unsigned ThreadCount)
: ActiveThreads(0), EnableFlag(true) {
ThreadPool::ThreadPool(ThreadPoolStrategy S)
: ActiveThreads(0), EnableFlag(true),
ThreadCount(S.compute_thread_count()) {
// Create ThreadCount threads that will loop forever, wait on QueueCondition
// for tasks to be queued or the Pool to be destroyed.
Threads.reserve(ThreadCount);
for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) {
Threads.emplace_back([&] {
Threads.emplace_back([S, ThreadID, this] {
S.apply_thread_strategy(ThreadID);
while (true) {
PackagedTaskTy Task;
{
Expand Down Expand Up @@ -108,12 +107,10 @@ ThreadPool::~ThreadPool() {

#else // LLVM_ENABLE_THREADS Disabled

ThreadPool::ThreadPool() : ThreadPool(0) {}

// No threads are launched, issue a warning if ThreadCount is not 0
ThreadPool::ThreadPool(unsigned ThreadCount)
: ActiveThreads(0) {
if (ThreadCount) {
ThreadPool::ThreadPool(ThreadPoolStrategy S)
: ActiveThreads(0), ThreadCount(S.compute_thread_count()) {
if (ThreadCount != 1) {
errs() << "Warning: request a ThreadPool with " << ThreadCount
<< " threads, but LLVM_ENABLE_THREADS has been turned off\n";
}
Expand All @@ -138,8 +135,6 @@ std::shared_future<void> ThreadPool::asyncImpl(TaskTy Task) {
return Future;
}

ThreadPool::~ThreadPool() {
wait();
}
ThreadPool::~ThreadPool() { wait(); }

#endif
46 changes: 19 additions & 27 deletions llvm/lib/Support/Threading.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,6 @@ void llvm::llvm_execute_on_thread(void (*Fn)(void *), void *UserData,
Fn(UserData);
}

unsigned llvm::heavyweight_hardware_concurrency() { return 1; }

unsigned llvm::hardware_concurrency() { return 1; }

uint64_t llvm::get_threadid() { return 0; }

uint32_t llvm::get_max_thread_name_length() { return 0; }
Expand All @@ -57,6 +53,13 @@ void llvm::set_thread_name(const Twine &Name) {}

void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); }

llvm::BitVector llvm::get_thread_affinity_mask() { return {}; }

unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
  // When threads are disabled, ensure clients will loop at least once.
  // ThreadsRequested and UseHyperThreads are intentionally ignored here.
  return 1;
}

#if LLVM_ENABLE_THREADS == 0
void llvm::llvm_execute_on_thread_async(
llvm::unique_function<void()> Func,
Expand All @@ -78,30 +81,19 @@ void llvm::llvm_execute_on_thread_async(

#else

#include <thread>
unsigned llvm::heavyweight_hardware_concurrency() {
// Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use
// `std::thread` directly instead of `llvm::thread` (and indeed, doing so
// allows us to not define `thread` in the llvm namespace, which conflicts
// with some platforms such as FreeBSD whose headers also define a struct
// called `thread` in the global namespace which can cause ambiguity due to
// ADL.
int NumPhysical = sys::getHostNumPhysicalCores();
if (NumPhysical == -1)
return std::thread::hardware_concurrency();
return NumPhysical;
}
int computeHostNumHardwareThreads();

unsigned llvm::hardware_concurrency() {
#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
cpu_set_t Set;
if (sched_getaffinity(0, sizeof(Set), &Set))
return CPU_COUNT(&Set);
#endif
// Guard against std::thread::hardware_concurrency() returning 0.
if (unsigned Val = std::thread::hardware_concurrency())
return Val;
return 1;
/// Computes the number of threads this strategy allows: the hardware limit
/// (logical threads when SMT is allowed, physical cores otherwise), clamped
/// by the client's requested count when one was given.
unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
  // Hardware bound: logical threads if hyper-threading is allowed, physical
  // cores otherwise. Detection can fail (<= 0); fall back to 1 in that case.
  int Detected = UseHyperThreads ? computeHostNumHardwareThreads()
                                 : sys::getHostNumPhysicalCores();
  unsigned HardwareLimit = Detected > 0 ? (unsigned)Detected : 1;

  // No need to create more threads than there are hardware threads, it would
  // uselessly induce more context-switching and cache eviction.
  if (ThreadsRequested && ThreadsRequested <= HardwareLimit)
    return ThreadsRequested;
  return HardwareLimit;
}

namespace {
Expand Down
24 changes: 24 additions & 0 deletions llvm/lib/Support/Unix/Threading.inc
Original file line number Diff line number Diff line change
Expand Up @@ -267,3 +267,27 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
#endif
return SetThreadPriorityResult::FAILURE;
}

#include <thread>

// Returns the number of hardware threads (logical CPUs) available to this
// process. Honors the process CPU affinity mask when sched_getaffinity() is
// available; otherwise falls back to std::thread::hardware_concurrency().
// Always returns at least 1.
int computeHostNumHardwareThreads() {
#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
  cpu_set_t Set;
  // sched_getaffinity() returns 0 on success and -1 on failure, so the mask
  // must only be counted when the call succeeded. The previous check was
  // inverted: it read an *uninitialized* cpu_set_t on failure and ignored
  // the affinity mask entirely on success.
  if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
    return CPU_COUNT(&Set);
#endif
  // Guard against std::thread::hardware_concurrency() returning 0.
  if (unsigned Val = std::thread::hardware_concurrency())
    return Val;
  return 1;
}

// Thread placement is not implemented on Unix yet: threads stay wherever the
// OS scheduler puts them, regardless of the strategy.
void llvm::ThreadPoolStrategy::apply_thread_strategy(
unsigned ThreadPoolNum) const {}

llvm::BitVector llvm::get_thread_affinity_mask() {
// FIXME: Implement. Unix callers must not rely on this yet: it aborts.
llvm_unreachable("Not implemented!");
}

unsigned llvm::get_cpus() { return 1; }
162 changes: 162 additions & 0 deletions llvm/lib/Support/Windows/Threading.inc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include "WindowsSupport.h"
#include <process.h>

#include <bitset>

// Windows will at times define MemoryFence.
#ifdef MemoryFence
#undef MemoryFence
Expand Down Expand Up @@ -122,3 +124,163 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
? SetThreadPriorityResult::SUCCESS
: SetThreadPriorityResult::FAILURE;
}

// Describes one Windows processor group: a set of up to 64 logical
// processors that a thread can run on without switching groups.
struct ProcessorGroup {
unsigned ID;             // Index of this group in discovery order.
unsigned AllThreads;     // Maximum logical processors in the group.
unsigned UsableThreads;  // Logical processors active/usable by this process.
unsigned ThreadsPerCore; // Hyper-threads per physical core in this group.
uint64_t Affinity;       // Bit mask of the usable logical processors.
};

// Invokes Fn on every SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX record of the
// requested Relationship. Returns false if the size of the processor
// information could not be queried.
template <typename F>
static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) {
DWORD Len = 0;
// The first call is expected to fail with ERROR_INSUFFICIENT_BUFFER and
// report the required buffer size in Len.
BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
return false;
}
auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
if (R) {
auto *End =
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len);
// Records are variable-sized; advance by each record's Size field.
for (auto *Curr = Info; Curr < End;
Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr +
Curr->Size)) {
if (Curr->Relationship != Relationship)
continue;
Fn(Curr);
}
}
// NOTE(review): if the second query fails, this still returns true and the
// caller sees zero records — confirm that is intended.
free(Info);
return true;
}

// Enumerates the Windows processor groups and how many logical processors in
// each are usable by this process. The result is computed once and cached in
// a function-local static.
static ArrayRef<ProcessorGroup> getProcessorGroups() {
auto computeGroups = []() {
SmallVector<ProcessorGroup, 4> Groups;

// Pass 1: record every active group with its maximum/active processor
// counts and its affinity mask.
auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
GROUP_RELATIONSHIP &El = ProcInfo->Group;
for (unsigned J = 0; J < El.ActiveGroupCount; ++J) {
ProcessorGroup G;
G.ID = Groups.size();
G.AllThreads = El.GroupInfo[J].MaximumProcessorCount;
G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount;
assert(G.UsableThreads <= 64);
G.Affinity = El.GroupInfo[J].ActiveProcessorMask;
Groups.push_back(G);
}
};

if (!IterateProcInfo(RelationGroup, HandleGroup))
return std::vector<ProcessorGroup>();

// Pass 2: derive hyper-threads-per-core from each core's mask.
auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor;
assert(El.GroupCount == 1);
unsigned NumHyperThreads = 1;
// If the flag is set, each core supports more than one hyper-thread.
if (El.Flags & LTP_PC_SMT)
NumHyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count();
unsigned I = El.GroupMask[0].Group;
Groups[I].ThreadsPerCore = NumHyperThreads;
};

if (!IterateProcInfo(RelationProcessorCore, HandleProc))
return std::vector<ProcessorGroup>();

// If there's an affinity mask set on one of the CPUs, then assume the user
// wants to constrain the current process to only a single CPU.
for (auto &G : Groups) {
if (G.UsableThreads != G.AllThreads) {
ProcessorGroup NewG{G};
Groups.clear();
Groups.push_back(NewG);
break;
}
}

return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
};
// Computed once; subsequent calls return the cached vector as an ArrayRef.
static auto Groups = computeGroups();
return ArrayRef<ProcessorGroup>(Groups);
}

// Sums P(Element) over every element of Range.
template <typename R, typename UnaryPredicate>
static unsigned aggregate(R &&Range, UnaryPredicate P) {
  unsigned Total = 0;
  for (const auto &Element : Range)
    Total += P(Element);
  return Total;
}

// for sys::getHostNumPhysicalCores
// Physical cores usable by this process: per group, active logical
// processors divided by hyper-threads per core. Cached on first use.
int computeHostNumPhysicalCores() {
static unsigned Cores =
aggregate(getProcessorGroups(), [](const ProcessorGroup &G) {
return G.UsableThreads / G.ThreadsPerCore;
});
return Cores;
}

// Total logical processors this process may use, summed across all
// processor groups. Cached on first use.
int computeHostNumHardwareThreads() {
static unsigned Threads =
aggregate(getProcessorGroups(),
[](const ProcessorGroup &G) { return G.UsableThreads; });
return Threads;
}

// Assign the current thread to a more appropriate CPU socket or CPU group
void llvm::ThreadPoolStrategy::apply_thread_strategy(
unsigned ThreadPoolNum) const {
ArrayRef<ProcessorGroup> Groups = getProcessorGroups();

assert(ThreadPoolNum < compute_thread_count() &&
"The thread index is not within thread strategy's range!");

// In this mode, the ThreadNumber represents the core number, not the
// hyper-thread number. Assumes all NUMA groups have the same amount of
// hyper-threads.
if (!UseHyperThreads)
ThreadPoolNum *= Groups[0].ThreadsPerCore;

// Find the group whose [ThreadRangeStart, ThreadRangeStart + UsableThreads)
// range contains ThreadPoolNum, and bind the thread to that whole group's
// affinity mask. Ranges are disjoint, so at most one group matches.
unsigned ThreadRangeStart = 0;
for (unsigned I = 0; I < Groups.size(); ++I) {
const ProcessorGroup &G = Groups[I];
if (ThreadPoolNum >= ThreadRangeStart &&
ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {

GROUP_AFFINITY Affinity{};
Affinity.Group = G.ID;
Affinity.Mask = G.Affinity;
SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
}
ThreadRangeStart += G.UsableThreads;
}
}

// Returns a BitVector over all logical processors in the system, with the
// bits set on which the calling thread may execute (it cannot cross group
// boundaries on Windows).
llvm::BitVector llvm::get_thread_affinity_mask() {
GROUP_AFFINITY Affinity{};
GetThreadGroupAffinity(GetCurrentThread(), &Affinity);

// Total logical processors across all groups; computed once.
static unsigned All =
aggregate(getProcessorGroups(),
[](const ProcessorGroup &G) { return G.AllThreads; });

// Bit offset of this thread's group within the global bit vector.
unsigned StartOffset =
aggregate(getProcessorGroups(), [&](const ProcessorGroup &G) {
return G.ID < Affinity.Group ? G.AllThreads : 0;
});

// Translate the per-group 64-bit mask into global bit positions.
llvm::BitVector V;
V.resize(All);
for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) {
if ((Affinity.Mask >> I) & 1)
V.set(StartOffset + I);
}
return V;
}

unsigned llvm::get_cpus() { return getProcessorGroups().size(); }
9 changes: 5 additions & 4 deletions llvm/tools/dsymutil/dsymutil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) {
if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads))
Options.LinkOpts.Threads = atoi(NumThreads->getValue());
else
Options.LinkOpts.Threads = thread::hardware_concurrency();
Options.LinkOpts.Threads = 0; // Use all available hardware threads

if (Options.DumpDebugMap || Options.LinkOpts.Verbose)
Options.LinkOpts.Threads = 1;
Expand Down Expand Up @@ -541,9 +541,10 @@ int main(int argc, char **argv) {
// Shared a single binary holder for all the link steps.
BinaryHolder BinHolder;

unsigned ThreadCount =
std::min<unsigned>(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size());
ThreadPool Threads(ThreadCount);
unsigned ThreadCount = Options.LinkOpts.Threads;
if (!ThreadCount)
ThreadCount = DebugMapPtrsOrErr->size();
ThreadPool Threads(hardware_concurrency(ThreadCount));

// If there is more than one link to execute, we need to generate
// temporary files.
Expand Down
4 changes: 2 additions & 2 deletions llvm/tools/gold/gold-plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ namespace options {
static unsigned OptLevel = 2;
// Default parallelism of 0 used to indicate that user did not specify.
// Actual parallelism default value depends on implementation.
// Currently only affects ThinLTO, where the default is
// llvm::heavyweight_hardware_concurrency.
// Currently only affects ThinLTO, where the default is the max cores in the
// system.
static unsigned Parallelism = 0;
// Default regular LTO codegen parallelism (number of partitions).
static unsigned ParallelCodeGenParallelismLevel = 1;
Expand Down
6 changes: 2 additions & 4 deletions llvm/tools/llvm-cov/CodeCoverage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -947,17 +947,15 @@ int CodeCoverageTool::doShow(int argc, const char **argv,

// If NumThreads is not specified, auto-detect a good default.
if (NumThreads == 0)
NumThreads =
std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
unsigned(SourceFiles.size())));
NumThreads = SourceFiles.size();

if (!ViewOpts.hasOutputDirectory() || NumThreads == 1) {
for (const std::string &SourceFile : SourceFiles)
writeSourceFileView(SourceFile, Coverage.get(), Printer.get(),
ShowFilenames);
} else {
// In -output-dir mode, it's safe to use multiple threads to print files.
ThreadPool Pool(NumThreads);
ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
for (const std::string &SourceFile : SourceFiles)
Pool.async(&CodeCoverageTool::writeSourceFileView, this, SourceFile,
Coverage.get(), Printer.get(), ShowFilenames);
Expand Down
8 changes: 3 additions & 5 deletions llvm/tools/llvm-cov/CoverageExporterJson.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,11 +163,9 @@ json::Array renderFiles(const coverage::CoverageMapping &Coverage,
ArrayRef<FileCoverageSummary> FileReports,
const CoverageViewOptions &Options) {
auto NumThreads = Options.NumThreads;
if (NumThreads == 0) {
NumThreads = std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
unsigned(SourceFiles.size())));
}
ThreadPool Pool(NumThreads);
if (NumThreads == 0)
NumThreads = SourceFiles.size();
ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
json::Array FileArray;
std::mutex FileArrayMutex;

Expand Down
7 changes: 2 additions & 5 deletions llvm/tools/llvm-cov/CoverageReport.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,11 +356,8 @@ std::vector<FileCoverageSummary> CoverageReport::prepareFileReports(

// If NumThreads is not specified, auto-detect a good default.
if (NumThreads == 0)
NumThreads =
std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
unsigned(Files.size())));

ThreadPool Pool(NumThreads);
NumThreads = Files.size();
ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));

std::vector<FileCoverageSummary> FileReports;
FileReports.reserve(Files.size());
Expand Down
4 changes: 2 additions & 2 deletions llvm/tools/llvm-lto2/llvm-lto2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ static cl::opt<bool>
"import files for the "
"distributed backend case"));

static cl::opt<int> Threads("thinlto-threads",
cl::init(llvm::heavyweight_hardware_concurrency()));
// Default to using all hardware cores in the system.
static cl::opt<int> Threads("thinlto-threads", cl::init(0));

static cl::list<std::string> SymbolResolutions(
"r",
Expand Down
9 changes: 6 additions & 3 deletions llvm/tools/llvm-profdata/llvm-profdata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,8 +307,11 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs,

// If NumThreads is not specified, auto-detect a good default.
if (NumThreads == 0)
NumThreads =
std::min(hardware_concurrency(), unsigned((Inputs.size() + 1) / 2));
NumThreads = std::min(hardware_concurrency().compute_thread_count(),
unsigned((Inputs.size() + 1) / 2));
// FIXME: There's a bug here, where setting NumThreads = Inputs.size() fails
// the merge_empty_profile.test because the InstrProfWriter.ProfileKind isn't
// merged, thus the emitted file ends up with a PF_Unknown kind.

// Initialize the writer contexts.
SmallVector<std::unique_ptr<WriterContext>, 4> Contexts;
Expand All @@ -320,7 +323,7 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs,
for (const auto &Input : Inputs)
loadInput(Input, Remapper, Contexts[0].get());
} else {
ThreadPool Pool(NumThreads);
ThreadPool Pool(hardware_concurrency(NumThreads));

// Load the inputs in parallel (N/NumThreads serial steps).
unsigned Ctx = 0;
Expand Down
35 changes: 35 additions & 0 deletions llvm/unittests/ADT/BitVectorTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallBitVector.h"
#include "gtest/gtest.h"

Expand Down Expand Up @@ -1149,4 +1150,38 @@ TYPED_TEST(BitVectorTest, PushBack) {
EXPECT_EQ(213U, Vec.size());
EXPECT_EQ(102U, Vec.count());
}

// Exercises BitVector/SmallBitVector as a DenseSet key: insert, count,
// erase, and the death-test guard on the reserved empty key.
TYPED_TEST(BitVectorTest, DenseSet) {
DenseSet<TypeParam> Set;
TypeParam A(10, true);
auto I = Set.insert(A);
EXPECT_EQ(true, I.second);

TypeParam B(5, true);
I = Set.insert(B);
EXPECT_EQ(true, I.second);

TypeParam C(20, false);
C.set(19);
I = Set.insert(C);
EXPECT_EQ(true, I.second);

// A default-constructed vector is the DenseMap empty key and must be
// rejected, per the assertion message below.
TypeParam D;
EXPECT_DEATH(Set.insert(D),
"Empty/Tombstone value shouldn't be inserted into map!");

EXPECT_EQ(3U, Set.size());
EXPECT_EQ(1U, Set.count(A));
EXPECT_EQ(1U, Set.count(B));
EXPECT_EQ(1U, Set.count(C));

// Erase each element and verify the size shrinks accordingly.
EXPECT_EQ(true, Set.erase(B));
EXPECT_EQ(2U, Set.size());

EXPECT_EQ(true, Set.erase(C));
EXPECT_EQ(1U, Set.size());

EXPECT_EQ(true, Set.erase(A));
EXPECT_EQ(0U, Set.size());
}
} // namespace
3 changes: 2 additions & 1 deletion llvm/unittests/Support/Host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ class HostTest : public testing::Test {
// Initially this is only testing detection of the number of
// physical cores, which is currently only supported/tested for
// x86_64 Linux and Darwin.
return (Host.getArch() == Triple::x86_64 &&
return Host.isOSWindows() ||
(Host.getArch() == Triple::x86_64 &&
(Host.isOSDarwin() || Host.getOS() == Triple::Linux));
}

Expand Down
6 changes: 3 additions & 3 deletions llvm/unittests/Support/TaskQueueTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class TaskQueueTest : public testing::Test {
};

TEST_F(TaskQueueTest, OrderedFutures) {
ThreadPool TP(1);
ThreadPool TP(hardware_concurrency(1));
TaskQueue TQ(TP);
std::atomic<int> X{ 0 };
std::atomic<int> Y{ 0 };
Expand Down Expand Up @@ -66,7 +66,7 @@ TEST_F(TaskQueueTest, OrderedFutures) {
}

TEST_F(TaskQueueTest, UnOrderedFutures) {
ThreadPool TP(1);
ThreadPool TP(hardware_concurrency(1));
TaskQueue TQ(TP);
std::atomic<int> X{ 0 };
std::atomic<int> Y{ 0 };
Expand Down Expand Up @@ -96,7 +96,7 @@ TEST_F(TaskQueueTest, UnOrderedFutures) {
}

TEST_F(TaskQueueTest, FutureWithReturnValue) {
ThreadPool TP(1);
ThreadPool TP(hardware_concurrency(1));
TaskQueue TQ(TP);
std::future<std::string> F1 = TQ.async([&] { return std::string("Hello"); });
std::future<int> F2 = TQ.async([&] { return 42; });
Expand Down
48 changes: 47 additions & 1 deletion llvm/unittests/Support/ThreadPool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@

#include "llvm/Support/ThreadPool.h"

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/Threading.h"

#include "gtest/gtest.h"

Expand Down Expand Up @@ -69,6 +71,8 @@ class ThreadPoolTest : public testing::Test {

void SetUp() override { MainThreadReady = false; }

void TestAllThreads(ThreadPoolStrategy S);

std::condition_variable WaitMainThread;
std::mutex WaitMainThreadMutex;
bool MainThreadReady = false;
Expand Down Expand Up @@ -131,7 +135,7 @@ TEST_F(ThreadPoolTest, Async) {

TEST_F(ThreadPoolTest, GetFuture) {
CHECK_UNSUPPORTED();
ThreadPool Pool{2};
ThreadPool Pool(hardware_concurrency(2));
std::atomic_int i{0};
Pool.async([this, &i] {
waitForMainThread();
Expand Down Expand Up @@ -162,3 +166,45 @@ TEST_F(ThreadPoolTest, PoolDestruction) {
}
ASSERT_EQ(5, checked_in);
}

#if LLVM_ENABLE_THREADS == 1

// Runs many tasks through a pool built with strategy S and verifies, via
// each thread's affinity mask, that every CPU/NUMA group was used.
void ThreadPoolTest::TestAllThreads(ThreadPoolStrategy S) {
// FIXME: Skip these tests on non-Windows because multi-socket systems were
// not tested on Unix yet, and llvm::get_thread_affinity_mask() isn't
// implemented for Unix.
Triple Host(Triple::normalize(sys::getProcessTriple()));
if (!Host.isOSWindows())
return;

llvm::DenseSet<llvm::BitVector> ThreadsUsed;
std::mutex Lock;
// NOTE(review): Threads is written but never read afterwards — consider
// asserting against it or removing it.
unsigned Threads = 0;
{
ThreadPool Pool(S);
Threads = Pool.getThreadCount();
for (size_t I = 0; I < 10000; ++I) {
Pool.async([&] {
waitForMainThread();
std::lock_guard<std::mutex> Guard(Lock);
auto Mask = llvm::get_thread_affinity_mask();
ThreadsUsed.insert(Mask);
});
}
// All tasks are blocked in waitForMainThread(), so none may have run yet.
ASSERT_EQ(true, ThreadsUsed.empty());
setMainThreadReady();
}
// One distinct affinity mask per CPU group proves all sockets were used.
ASSERT_EQ(llvm::get_cpus(), ThreadsUsed.size());
}

// Default strategy: all hardware threads, hyper-threads included.
// NOTE(review): "Ressources" is misspelled; renaming would change the
// registered test name, so it is left as-is here.
TEST_F(ThreadPoolTest, AllThreads_UseAllRessources) {
CHECK_UNSUPPORTED();
TestAllThreads({});
}

// Heavyweight strategy: one thread per physical core, no hyper-threads.
TEST_F(ThreadPoolTest, AllThreads_OneThreadPerCore) {
CHECK_UNSUPPORTED();
TestAllThreads(llvm::heavyweight_hardware_concurrency());
}

#endif
3 changes: 2 additions & 1 deletion llvm/unittests/Support/Threading.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ TEST(Threading, PhysicalConcurrency) {
auto Num = heavyweight_hardware_concurrency();
// Since Num is unsigned this will also catch us trying to
// return -1.
ASSERT_LE(Num, thread::hardware_concurrency());
ASSERT_LE(Num.compute_thread_count(),
hardware_concurrency().compute_thread_count());
}

#if LLVM_ENABLE_THREADS
Expand Down
3 changes: 2 additions & 1 deletion mlir/lib/Pass/Pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,8 @@ void OpToOpPassAdaptorParallel::runOnOperation() {
// Create the async executors if they haven't been created, or if the main
// pipeline has changed.
if (asyncExecutors.empty() || hasSizeMismatch(asyncExecutors.front(), mgrs))
asyncExecutors.assign(llvm::hardware_concurrency(), mgrs);
asyncExecutors.assign(llvm::hardware_concurrency().compute_thread_count(),
mgrs);

// Run a prepass over the module to collect the operations to execute over.
// This ensures that an analysis manager exists for each operation, as well as
Expand Down