From ef35675b483f574577bad19e0786274c22e1fb44 Mon Sep 17 00:00:00 2001
From: Ned Batchelder
Date: Mon, 7 Nov 2022 16:11:26 -0500
Subject: [PATCH] perf: hash data files during combining to avoid unneeded work. #1483

When generating many parallel data files, some of them will often be
exact copies of each other. By checking file hashes, we can avoid
combining the duplicates, speeding up the process. The "when" timestamp
is no longer written to the data file's meta table, so that
otherwise-identical data files stay byte-for-byte identical and hash
the same.
---
 coverage/data.py          | 30 ++++++++++++++++++++++--------
 coverage/sqldata.py       |  2 +-
 tests/test_api.py         |  2 +-
 tests/test_concurrency.py |  5 ++++-
 4 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/coverage/data.py b/coverage/data.py
index 4bdfe3010..1a8f35d0c 100644
--- a/coverage/data.py
+++ b/coverage/data.py
@@ -11,6 +11,7 @@
 """

 import glob
+import hashlib
 import os.path

 from coverage.exceptions import CoverageException, NoDataError
@@ -110,6 +111,7 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")

+    file_hashes = set()
     files_combined = 0
     for f in files_to_combine:
         if f == data.data_filename():
@@ -118,6 +120,25 @@
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
+
+        try:
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+        if sha in file_hashes:
+            if message:
+                message(f"Skipping duplicate data {rel_file_name}")
+            continue
+        file_hashes.add(sha)
+
         if data._debug.should('dataio'):
             data._debug.write(f"Combining data file {f!r}")
         try:
@@ -132,14 +153,7 @@
             data.update(new_data, aliases=aliases)
             files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
+                message(f"Combined data file {rel_file_name}")
             if not keep:
                 if data._debug.should('dataio'):
                     data._debug.write(f"Deleting combined data file {f!r}")
diff --git a/coverage/sqldata.py b/coverage/sqldata.py
index 2b7730537..0aa67364f 100644
--- a/coverage/sqldata.py
+++ b/coverage/sqldata.py
@@ -305,7 +305,7 @@ def _init_db(self, db):
             [
                 ("sys_argv", str(getattr(sys, "argv", None))),
                 ("version", __version__),
-                ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
+                #("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
             ]
         )

diff --git a/tests/test_api.py b/tests/test_api.py
index ce44b9b1c..195452323 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1362,7 +1362,7 @@ def test_combine_no_usable_files(self):

         # Make bogus data files.
         self.make_file(".coverage.bad1", "This isn't a coverage data file.")
-        self.make_file(".coverage.bad2", "This isn't a coverage data file.")
+        self.make_file(".coverage.bad2", "This isn't a coverage data file either.")

         # Combine the parallel coverage data files into .coverage, but nothing is readable.
         cov = coverage.Coverage()
diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py
index 0a51d4d96..30dae136c 100644
--- a/tests/test_concurrency.py
+++ b/tests/test_concurrency.py
@@ -484,7 +484,10 @@ def try_multiprocessing_code(
         out_lines = out.splitlines()
         assert len(out_lines) == nprocs + 1
         assert all(
-            re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
+            re.fullmatch(
+                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
+                line
+            )
             for line in out_lines
         )
         out = self.run_command("coverage report -m")
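
For reference, the de-duplication idea reduced to a standalone sketch (the
`deduplicate` helper below is illustrative only, not part of coverage.py; the
real logic is inline in combine_parallel_data() above): hash each candidate
file's bytes and skip any file whose digest has already been seen.

    import hashlib

    def deduplicate(paths):
        """Yield only the first of each group of byte-identical files."""
        seen = set()
        for path in paths:
            # Hash the file's full contents, as the patch above does.
            with open(path, "rb") as fobj:
                digest = hashlib.new("sha3_256", fobj.read()).digest()
            if digest in seen:
                continue  # exact copy of a file we already yielded
            seen.add(digest)
            yield path

With this change, `coverage combine` should report skipped copies with
"Skipping duplicate data <file>" messages alongside the existing
"Combined data file <file>" messages, as the updated test_concurrency
regex expects.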