diff --git a/llvm/utils/lit/lit/run.py b/llvm/utils/lit/lit/run.py index 62070e824e87f..ea280e3ce3521 100644 --- a/llvm/utils/lit/lit/run.py +++ b/llvm/utils/lit/lit/run.py @@ -7,6 +7,14 @@ import lit.util import lit.worker +# Windows has a limit of 60 workers per pool. +# This is defined in the multiprocessing module implementation. +# See: https://github.com/python/cpython/blob/6bc65c30ff1fd0b581a2c93416496fc720bc442c/Lib/concurrent/futures/process.py#L669-L672 +WINDOWS_MAX_WORKERS_PER_POOL = 60 + + +def _ceilDiv(a, b): + return (a + b - 1) // b class MaxFailuresError(Exception): pass @@ -72,25 +80,65 @@ def _execute(self, deadline): if v is not None } - pool = multiprocessing.Pool( - self.workers, lit.worker.initialize, (self.lit_config, semaphores) + # Windows has a limit of 60 workers per pool, so we need to use multiple pools + # if we have more workers requested than the limit. + # Also, allow to override the limit with the LIT_WINDOWS_MAX_WORKERS_PER_POOL environment variable. + max_workers_per_pool = ( + WINDOWS_MAX_WORKERS_PER_POOL if os.name == "nt" else self.workers + ) + max_workers_per_pool = int( + os.getenv("LIT_WINDOWS_MAX_WORKERS_PER_POOL", max_workers_per_pool) ) - async_results = [ - pool.apply_async( - lit.worker.execute, args=[test], callback=self.progress_callback + num_pools = max(1, _ceilDiv(self.workers, max_workers_per_pool)) + + # Distribute self.workers across num_pools as evenly as possible + workers_per_pool_list = [self.workers // num_pools] * num_pools + for pool_idx in range(self.workers % num_pools): + workers_per_pool_list[pool_idx] += 1 + + if num_pools > 1: + self.lit_config.note( + "Using %d pools balancing %d workers total distributed as %s (Windows worker limit workaround)" + % (num_pools, self.workers, workers_per_pool_list) ) - for test in self.tests - ] - pool.close() + + # Create multiple pools + pools = [] + for pool_size in workers_per_pool_list: + pool = multiprocessing.Pool( + pool_size, lit.worker.initialize, 
(self.lit_config, semaphores) + ) + pools.append(pool) + + # Distribute tests across pools + tests_per_pool = _ceilDiv(len(self.tests), num_pools) + async_results = [] + + for pool_idx, pool in enumerate(pools): + start_idx = pool_idx * tests_per_pool + end_idx = min(start_idx + tests_per_pool, len(self.tests)) + for test in self.tests[start_idx:end_idx]: + ar = pool.apply_async( + lit.worker.execute, args=[test], callback=self.progress_callback + ) + async_results.append(ar) + + # Close all pools + for pool in pools: + pool.close() try: self._wait_for(async_results, deadline) except: - pool.terminate() + # Terminate all pools on exception + for pool in pools: + pool.terminate() raise finally: - pool.join() + # Join all pools + for pool in pools: + pool.join() def _wait_for(self, async_results, deadline): timeout = deadline - time.time() diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py index ce4c3c2df3436..518c1a3029b86 100644 --- a/llvm/utils/lit/lit/util.py +++ b/llvm/utils/lit/lit/util.py @@ -113,11 +113,6 @@ def usable_core_count(): except AttributeError: n = os.cpu_count() or 1 - # On Windows with more than 60 processes, multiprocessing's call to - # _winapi.WaitForMultipleObjects() prints an error and lit hangs. - if platform.system() == "Windows": - return min(n, 60) - return n def abs_path_preserve_drive(path): diff --git a/llvm/utils/lit/tests/windows-pools.py b/llvm/utils/lit/tests/windows-pools.py new file mode 100644 index 0000000000000..67dd852955248 --- /dev/null +++ b/llvm/utils/lit/tests/windows-pools.py @@ -0,0 +1,27 @@ +# Create a directory with 20 files and check the number of pools and workers per pool that lit will use. 
+
+# RUN: rm -Rf %t.dir && mkdir -p %t.dir
+# RUN: %{python} -c "for i in range(20): open(f'%t.dir/file{i}.txt', 'w').write('RUN:')"
+
+# RUN: echo "import lit.formats" > %t.dir/lit.cfg
+# RUN: echo "config.name = \"top-level-suite\"" >> %t.dir/lit.cfg
+# RUN: echo "config.suffixes = [\".txt\"]" >> %t.dir/lit.cfg
+# RUN: echo "config.test_format = lit.formats.ShTest()" >> %t.dir/lit.cfg
+
+
+# 15 workers per pool max, 100 workers total max: we expect lit to cap the workers to the number of files
+# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=15" %{lit} -s %t.dir/ -j100 > %t.out 2>&1
+# CHECK: Using 2 pools balancing 20 workers total distributed as [10, 10]
+# CHECK: Passed: 20
+
+# 5 workers per pool max, 17 workers total max
+# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=5" %{lit} -s %t.dir/ -j17 >> %t.out 2>&1
+# CHECK: Using 4 pools balancing 17 workers total distributed as [5, 4, 4, 4]
+# CHECK: Passed: 20
+
+# 19 workers per pool max, 19 workers total max
+# RUN: env "LIT_WINDOWS_MAX_WORKERS_PER_POOL=19" %{lit} -s %t.dir/ -j19 >> %t.out 2>&1
+# CHECK-NOT: workers total distributed as
+# CHECK: Passed: 20
+
+# RUN: cat %t.out | FileCheck %s