From ef35675b483f574577bad19e0786274c22e1fb44 Mon Sep 17 00:00:00 2001
From: Ned Batchelder
Date: Mon, 7 Nov 2022 16:11:26 -0500
Subject: [PATCH] perf: hash data files during combining to avoid unneeded work. #1483

When generating many parallel data files, some of them will often be
exact copies of each other. By checking file hashes, we can avoid
combining the duplicates, speeding up the process. The "when" timestamp
is no longer written to the data file's meta table, so that
otherwise-identical data files stay byte-for-byte identical and hash
the same.
---
 coverage/data.py          | 30 ++++++++++++++++++++++--------
 coverage/sqldata.py       |  2 +-
 tests/test_api.py         |  2 +-
 tests/test_concurrency.py |  5 ++++-
 4 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/coverage/data.py b/coverage/data.py
index 4bdfe3010..1a8f35d0c 100644
--- a/coverage/data.py
+++ b/coverage/data.py
@@ -11,6 +11,7 @@
 """

 import glob
+import hashlib
 import os.path

 from coverage.exceptions import CoverageException, NoDataError
@@ -110,6 +111,7 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")

+    file_hashes = set()
     files_combined = 0
     for f in files_to_combine:
         if f == data.data_filename():
@@ -118,6 +120,25 @@
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
+
+        try:
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+        if sha in file_hashes:
+            if message:
+                message(f"Skipping duplicate data {rel_file_name}")
+            continue
+        file_hashes.add(sha)
+
         if data._debug.should('dataio'):
             data._debug.write(f"Combining data file {f!r}")
         try:
@@ -132,14 +153,7 @@
             data.update(new_data, aliases=aliases)
             files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
+                message(f"Combined data file {rel_file_name}")
             if not keep:
                 if data._debug.should('dataio'):
                     data._debug.write(f"Deleting combined data file {f!r}")
diff --git a/coverage/sqldata.py b/coverage/sqldata.py
index 2b7730537..0aa67364f 100644
--- a/coverage/sqldata.py
+++ b/coverage/sqldata.py
@@ -305,7 +305,7 @@ def _init_db(self, db):
             [
                 ("sys_argv", str(getattr(sys, "argv", None))),
                 ("version", __version__),
-                ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
+                #("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
             ]
         )

diff --git a/tests/test_api.py b/tests/test_api.py
index ce44b9b1c..195452323 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1362,7 +1362,7 @@ def test_combine_no_usable_files(self):

         # Make bogus data files.
         self.make_file(".coverage.bad1", "This isn't a coverage data file.")
-        self.make_file(".coverage.bad2", "This isn't a coverage data file.")
+        self.make_file(".coverage.bad2", "This isn't a coverage data file either.")

         # Combine the parallel coverage data files into .coverage, but nothing is readable.
         cov = coverage.Coverage()
diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py
index 0a51d4d96..30dae136c 100644
--- a/tests/test_concurrency.py
+++ b/tests/test_concurrency.py
@@ -484,7 +484,10 @@ def try_multiprocessing_code(
         out_lines = out.splitlines()
         assert len(out_lines) == nprocs + 1
         assert all(
-            re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
+            re.fullmatch(
+                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
+                line
+            )
             for line in out_lines
         )
         out = self.run_command("coverage report -m")
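
For reference, the de-duplication idea reduced to a standalone sketch (the
`deduplicate` helper below is illustrative only, not part of coverage.py; the
real logic is inline in combine_parallel_data() above): hash each candidate
file's bytes and skip any file whose digest has already been seen.

    import hashlib

    def deduplicate(paths):
        """Yield only the first of each group of byte-identical files."""
        seen = set()
        for path in paths:
            # Hash the file's full contents, as the patch above does.
            with open(path, "rb") as fobj:
                digest = hashlib.new("sha3_256", fobj.read()).digest()
            if digest in seen:
                continue  # exact copy of a file we already yielded
            seen.add(digest)
            yield path

With this change, `coverage combine` should report skipped copies with
"Skipping duplicate data <file>" messages alongside the existing
"Combined data file <file>" messages, as the updated test_concurrency
regex expects.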