Skip to content

Commit

Permalink
Unrevert #8093 and fix jdeps parsing. (#8125)
Browse files Browse the repository at this point in the history
### Problem

Previous jdeps PR (#8093) was reverted due to a bug where regex parsing of jdeps output failed.

### Solution

Fix the root cause: incorrect arguments were passed to jdeps when targets had no deps with classpaths. Also add an extra layer of safety checking when matching individual lines of the summary output.

### Result

dep-usage task functions more reliably.
  • Loading branch information
Henry Fuller authored and stuhood committed Jul 30, 2019
1 parent ea100e2 commit 9004bdd
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 145 deletions.
190 changes: 108 additions & 82 deletions src/python/pants/backend/jvm/tasks/analysis_extraction.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

import io
import json
import os
from collections import defaultdict
import re
import subprocess
from contextlib import contextmanager

from pants.backend.jvm.subsystems.dependency_context import DependencyContext
from pants.backend.jvm.subsystems.zinc import Zinc
from pants.backend.jvm.tasks.nailgun_task import NailgunTask
from pants.base.build_environment import get_buildroot
from pants.base.exceptions import TaskError
from pants.base.workunit import WorkUnitLabel
from pants.goal.products import MultipleRootedProducts
from pants.base.deprecated import deprecated_conditional
from pants.java.distribution.distribution import DistributionLocator
from pants.java.executor import SubprocessExecutor
from pants.util.contextutil import temporary_dir
from pants.util.memo import memoized_property


Expand All @@ -33,12 +36,11 @@ def register_options(cls, register):
@classmethod
def prepare(cls, options, round_manager):
super().prepare(options, round_manager)
round_manager.require_data('zinc_analysis')
round_manager.require_data('runtime_classpath')

@classmethod
def product_types(cls):
return ['classes_by_source', 'product_deps_by_src']
return ['product_deps_by_target', 'classes_by_source', 'product_deps_by_src']

def _create_products_if_should_run(self):
"""If this task should run, initialize empty products that it will populate.
Expand All @@ -47,106 +49,130 @@ def _create_products_if_should_run(self):
"""

should_run = False
if self.context.products.is_required_data('classes_by_source'):
if self.context.products.is_required_data('product_deps_by_target'):
should_run = True
make_products = lambda: defaultdict(MultipleRootedProducts)
self.context.products.safe_create_data('classes_by_source', make_products)

if self.context.products.is_required_data('product_deps_by_src'):
should_run = True
self.context.products.safe_create_data('product_deps_by_src', dict)
self.context.products.safe_create_data('product_deps_by_target', dict)
return should_run

@memoized_property
def _zinc(self):
  """A Zinc instance configured from this task's products and execution strategy."""
  factory = Zinc.Factory.global_instance()
  return factory.create(self.context.products, self.get_options().execution_strategy)

def _summary_json_file(self, vt):
return os.path.join(vt.results_dir, 'summary.json')
def _jdeps_output_json(self, vt):
return os.path.join(vt.results_dir, 'jdeps_output.json')

@memoized_property
def _analysis_by_runtime_entry(self):
  """Map each runtime classpath entry to its zinc analysis file."""
  mapping = {}
  for _, cp_entry, analysis_file in self.context.products.get_data('zinc_analysis').values():
    mapping[cp_entry] = analysis_file
  return mapping
@contextmanager
def aliased_classpaths(self, classpaths):
  """Expose each classpath entry under a unique symlinked basename.

  jdeps keys its summary output on jar basenames, and the jars built for
  java_library targets all share the same basename (z.jar). Identical
  basenames collide in the jdeps output, so some .class files cannot be
  found and the summary is incomplete. Symlinking each entry in a
  temporary directory as ``<index>.jar`` (or ``<index>`` for directories)
  gives jdeps unique, descriptive names.

  Yields a dict mapping symlink path -> real classpath entry.
  """
  with temporary_dir() as tmp:
    alias_map = {}
    for index, entry in enumerate(classpaths):
      basename = str(index) if os.path.isdir(entry) else f"{index}.jar"
      link = os.path.join(tmp, basename)
      os.symlink(entry, link)
      alias_map[link] = entry
    yield alias_map

def execute(self):
# If none of our computed products are necessary, return immediately.
deprecated_conditional(
lambda: self.context.products.is_required_data('classes_by_source'),
'1.20.0.dev2',
'The `classes_by_source` product depends on internal compiler details and is no longer produced.'
)
deprecated_conditional(
lambda: self.context.products.is_required_data('product_deps_by_src'),
'1.20.0.dev2',
'The `product_deps_by_src` product depends on internal compiler details and is no longer produced. '
'For similar functionality consume `product_deps_by_target`.'
)

if not self._create_products_if_should_run():
return

zinc_analysis = self.context.products.get_data('zinc_analysis')
classpath_product = self.context.products.get_data('runtime_classpath')
classes_by_source = self.context.products.get_data('classes_by_source')
product_deps_by_src = self.context.products.get_data('product_deps_by_src')
product_deps_by_target = self.context.products.get_data('product_deps_by_target')

fingerprint_strategy = DependencyContext.global_instance().create_fingerprint_strategy(
classpath_product)

targets = list(zinc_analysis.keys())
# classpath fingerprint strategy only works on targets with a classpath.
targets = [target for target in self.context.targets() if hasattr(target, 'strict_deps')]
with self.invalidated(targets,
fingerprint_strategy=fingerprint_strategy,
invalidate_dependents=True) as invalidation_check:
# Extract and parse products for any relevant targets.
for vt in invalidation_check.all_vts:
summary_json_file = self._summary_json_file(vt)
cp_entry, _, analysis_file = zinc_analysis[vt.target]
# A list of class paths to the artifacts created by the target we are computing deps for.
target_artifact_classpaths = [path for _, path in classpath_product.get_for_target(vt.target)]
potential_deps_classpaths = self._zinc.compile_classpath('runtime_classpath', vt.target)

jdeps_output_json = self._jdeps_output_json(vt)
if not vt.valid:
self._extract_analysis(vt.target, analysis_file, summary_json_file)
self._run_jdeps_analysis(
vt.target,
target_artifact_classpaths,
potential_deps_classpaths,
jdeps_output_json
)
self._register_products(vt.target,
cp_entry,
summary_json_file,
classes_by_source,
product_deps_by_src)

def _extract_analysis(self, target, analysis_file, summary_json_file):
target_classpath = self._zinc.compile_classpath('runtime_classpath', target)
analysis_by_cp_entry = self._analysis_by_runtime_entry
upstream_analysis = list(self._upstream_analysis(target_classpath, analysis_by_cp_entry))
args = [
'-summary-json', summary_json_file,
'-analysis-cache', analysis_file,
'-classpath', ':'.join(target_classpath),
'-analysis-map', ','.join('{}:{}'.format(k, v) for k, v in upstream_analysis),
]

result = self.runjava(classpath=self._zinc.extractor,
main=Zinc.ZINC_EXTRACT_MAIN,
args=args,
workunit_name=Zinc.ZINC_EXTRACTOR_TOOL_NAME,
workunit_labels=[WorkUnitLabel.MULTITOOL])
if result != 0:
raise TaskError('Failed to parse analysis for {}'.format(target.address.spec),
exit_code=result)

def _upstream_analysis(self, target_classpath, analysis_by_cp_entry):
for entry in target_classpath:
analysis_file = analysis_by_cp_entry.get(entry)
if analysis_file is not None:
yield entry, analysis_file
jdeps_output_json,
product_deps_by_target)

@memoized_property
def _jdeps_summary_line_regex(self):
  """Pattern for one `jdeps -summary` line of the form `<source> -> <dep>`."""
  pattern = r"^.+\s->\s(.+)$"
  return re.compile(pattern)

def _run_jdeps_analysis(self, target, target_artifact_classpaths, potential_deps_classpaths, jdeps_output_json):
  """Run jdeps over the target's artifacts and write the dep classpaths found as a JSON list.

  :param target: the target under analysis.
  :param target_artifact_classpaths: classpath entries of the artifacts built from `target`.
  :param potential_deps_classpaths: classpath entries the target may depend on.
  :param jdeps_output_json: path of the JSON file to write (a list of classpath entries).
  """
  with self.aliased_classpaths(potential_deps_classpaths) as classpaths_by_alias:
    with open(jdeps_output_json, 'w') as f:
      # Unified empty-result type: previously the no-artifact branch used a
      # list while the jdeps branch used a set; both serialize identically.
      deps_classpaths = set()
      # Only invoke jdeps when the target actually produced artifacts
      # (idiomatic truthiness instead of `if len(...)`).
      if target_artifact_classpaths:
        # stderr is not piped, so communicate() returns None for it.
        jdeps_stdout, _ = self._spawn_jdeps_command(
          target, target_artifact_classpaths, classpaths_by_alias.keys()
        ).communicate()
        for line in io.StringIO(jdeps_stdout.decode('utf-8')):
          # Safety check: only consume summary lines of the form `X -> Y`;
          # anything else jdeps prints is ignored.
          match = self._jdeps_summary_line_regex.fullmatch(line.strip())
          if match is not None:
            dep_name = match.group(1)
            # Translate an aliased symlink back to the real classpath entry;
            # names that were not aliased pass through unchanged.
            deps_classpaths.add(classpaths_by_alias.get(dep_name, dep_name))
      json.dump(list(deps_classpaths), f)

def _spawn_jdeps_command(self, target, target_artifact_classpaths, potential_deps_classpaths):
  """Spawn a `jdeps -summary` subprocess for the target's artifacts.

  :param target: the target under analysis (not used to build the command line here —
    presumably kept for signature symmetry; TODO confirm).
  :param target_artifact_classpaths: classpath entries passed to jdeps as inputs.
  :param potential_deps_classpaths: candidate dep entries joined into `-classpath`.
  :returns: the spawned process, with stdout piped for the caller to read.
  """
  jdk = DistributionLocator.cached(jdk=True)
  tool_classpath = jdk.find_libs(['tools.jar'])
  # join() accepts the iterable directly; the redundant generator is dropped.
  potential_deps_classpath = ":".join(potential_deps_classpaths)

  args = ["-summary"]
  # Omit -classpath entirely when there are no candidate deps: passing an
  # empty value here is what broke jdeps for targets with no dep classpaths.
  if potential_deps_classpath:
    args.extend(['-classpath', potential_deps_classpath])

  args.extend(target_artifact_classpaths)

  java_executor = SubprocessExecutor(jdk)
  return java_executor.spawn(classpath=tool_classpath,
                             main='com.sun.tools.jdeps.Main',
                             jvm_options=self.get_options().jvm_options,
                             args=args,
                             stdout=subprocess.PIPE)

def _register_products(self,
                       target,
                       target_cp_entry,
                       summary_json_file,
                       classes_by_source,
                       product_deps_by_src):
  """Populate requested products for *target* from its parsed analysis summary."""
  summary_json = self._parse_summary_json(summary_json_file)

  # Source -> classfile mapping (only when the product was requested).
  if classes_by_source is not None:
    buildroot = get_buildroot()
    for abs_src, classes in summary_json['products'].items():
      rel_src = os.path.relpath(os.path.normpath(abs_src), buildroot)
      normalized_classes = [os.path.normpath(c) for c in classes]
      classes_by_source[rel_src].add_abs_paths(target_cp_entry, normalized_classes)

  # Per-source classfile dependencies (only when requested).
  if product_deps_by_src is not None:
    deps_by_src = {}
    for src, deps in summary_json['dependencies'].items():
      deps_by_src[os.path.normpath(src)] = [os.path.normpath(dep) for dep in deps]
    product_deps_by_src[target] = deps_by_src

def _parse_summary_json(self, summary_json_file):
with open(summary_json_file, 'r') as f:
return json.load(f, encoding='utf-8')
jdeps_output_json,
product_deps_by_target):
with open(jdeps_output_json) as f:
product_deps_by_target[target] = json.load(f)
56 changes: 31 additions & 25 deletions src/python/pants/backend/jvm/tasks/jvm_dependency_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,8 @@ def _skip(options):
def prepare(cls, options, round_manager):
super().prepare(options, round_manager)
if not cls._skip(options):
round_manager.require_data('product_deps_by_src')
round_manager.require_data('product_deps_by_target')
round_manager.require_data('runtime_classpath')
round_manager.require_data('zinc_analysis')

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -102,15 +101,15 @@ def execute(self):
fingerprint_strategy = DependencyContext.global_instance().create_fingerprint_strategy(
classpath_product)

targets = list(self.context.products.get_data('zinc_analysis').keys())
targets = [target for target in self.context.targets() if hasattr(target, 'strict_deps')]

with self.invalidated(targets,
fingerprint_strategy=fingerprint_strategy,
invalidate_dependents=True) as invalidation_check:
for vt in invalidation_check.invalid_vts:
product_deps_by_src = self.context.products.get_data('product_deps_by_src').get(vt.target)
if product_deps_by_src is not None:
self.check(vt.target, product_deps_by_src)
product_deps_for_target = self.context.products.get_data('product_deps_by_target').get(vt.target)
if product_deps_for_target is not None:
self.check(vt.target, product_deps_for_target)

def check(self, src_tgt, actual_deps):
"""Check for missing deps.
Expand All @@ -130,16 +129,19 @@ def shorten(path): # Make the output easier to read.

def filter_whitelisted(missing_deps):
# Removing any targets that exist in the whitelist from the list of dependency issues.
return [(tgt_pair, evidence) for (tgt_pair, evidence) in missing_deps
if tgt_pair[0].address not in self._target_whitelist]
return [
(tgt_pair, evidence)
for (tgt_pair, evidence) in missing_deps
if tgt_pair[0].address not in self._target_whitelist
]

missing_direct_tgt_deps = filter_whitelisted(missing_direct_tgt_deps)

if self._check_missing_direct_deps and missing_direct_tgt_deps:
log_fn = (self.context.log.error if self._check_missing_direct_deps == 'fatal'
else self.context.log.warn)
for (tgt_pair, evidence) in missing_direct_tgt_deps:
evidence_str = '\n'.join([' {} uses {}'.format(shorten(e[0]), shorten(e[1]))
evidence_str = '\n'.join([' {} uses {}'.format(e[0].address.spec, shorten(e[1]))
for e in evidence])
log_fn('Missing direct BUILD dependency {} -> {} because:\n{}'
.format(tgt_pair[0].address.spec, tgt_pair[1].address.spec, evidence_str))
Expand Down Expand Up @@ -180,6 +182,7 @@ def _compute_missing_deps(self, src_tgt, actual_deps):
All paths in the input and output are absolute.
"""
analyzer = self._analyzer

def must_be_explicit_dep(dep):
# We don't require explicit deps on the java runtime, so we shouldn't consider that
# a missing dep.
Expand All @@ -204,22 +207,23 @@ def target_or_java_dep_in_targets(target, targets):
missing_direct_tgt_deps_map = defaultdict(list) # The same, but for direct deps.

targets_by_file = analyzer.targets_by_file(self.context.targets())
buildroot = get_buildroot()
abs_srcs = [os.path.join(buildroot, src) for src in src_tgt.sources_relative_to_buildroot()]
for src in abs_srcs:
for actual_dep in filter(must_be_explicit_dep, actual_deps.get(src, [])):
actual_dep_tgts = targets_by_file.get(actual_dep)
# actual_dep_tgts is usually a singleton. If it's not, we only need one of these
# to be in our declared deps to be OK.
if actual_dep_tgts is None:
missing_file_deps.add((src_tgt, actual_dep))
elif not target_or_java_dep_in_targets(src_tgt, actual_dep_tgts):
# Obviously intra-target deps are fine.
canonical_actual_dep_tgt = next(iter(actual_dep_tgts))
if canonical_actual_dep_tgt not in src_tgt.dependencies:
# The canonical dep is the only one a direct dependency makes sense on.
missing_direct_tgt_deps_map[(src_tgt, canonical_actual_dep_tgt)].append(
(src, actual_dep))
for actual_dep in filter(must_be_explicit_dep, actual_deps):
actual_dep_tgts = targets_by_file.get(actual_dep)
# actual_dep_tgts is usually a singleton. If it's not, we only need one of these
# to be in our declared deps to be OK.
if actual_dep_tgts is None:
missing_file_deps.add((src_tgt, actual_dep))
elif not target_or_java_dep_in_targets(src_tgt, actual_dep_tgts):
# Obviously intra-target deps are fine.
canonical_actual_dep_tgt = next(iter(actual_dep_tgts))
if canonical_actual_dep_tgt not in src_tgt.dependencies:
# The canonical dep is the only one a direct dependency makes sense on.
# TODO: get rid of src usage here. We don't have a way to map class
# files back to source files when using jdeps. I think we can get away without
# listing the src file directly and just list the target which has the transitive
# dep.
missing_direct_tgt_deps_map[(src_tgt, canonical_actual_dep_tgt)].append(
(src_tgt, actual_dep))

return (list(missing_file_deps),
list(missing_direct_tgt_deps_map.items()))
Expand Down Expand Up @@ -254,6 +258,8 @@ def _compute_unnecessary_deps(self, target, actual_deps):
"""
# Flatten the product deps of this target.
product_deps = set()
# TODO: once we switch to the product_deps_by_target product, actual_deps
# will be just a list, not a dict.
for dep_entries in actual_deps.values():
product_deps.update(dep_entries)

Expand Down
Loading

0 comments on commit 9004bdd

Please sign in to comment.