Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use jdeps to run dep-usage task #8093

Merged
merged 5 commits into from
Jul 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 103 additions & 81 deletions src/python/pants/backend/jvm/tasks/analysis_extraction.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

import io
import json
import os
from collections import defaultdict
import re
import subprocess
from contextlib import contextmanager

from pants.backend.jvm.subsystems.dependency_context import DependencyContext
from pants.backend.jvm.subsystems.zinc import Zinc
from pants.backend.jvm.tasks.nailgun_task import NailgunTask
from pants.base.build_environment import get_buildroot
from pants.base.exceptions import TaskError
from pants.base.workunit import WorkUnitLabel
from pants.goal.products import MultipleRootedProducts
from pants.base.deprecated import deprecated_conditional
from pants.java.distribution.distribution import DistributionLocator
from pants.java.executor import SubprocessExecutor
from pants.util.contextutil import temporary_dir
from pants.util.memo import memoized_property


Expand All @@ -33,12 +36,11 @@ def register_options(cls, register):
@classmethod
def prepare(cls, options, round_manager):
    """Declare the products this task consumes.

    Only `runtime_classpath` is required now: the jdeps-based implementation
    reads compiled artifacts from the classpath instead of zinc analysis files.
    """
    super().prepare(options, round_manager)
    round_manager.require_data('runtime_classpath')

@classmethod
def product_types(cls):
    """Products this task can produce.

    `product_deps_by_target` is the jdeps-backed product; `classes_by_source`
    and `product_deps_by_src` remain listed only for deprecation (see
    `execute`, which warns when they are required).
    """
    return ['product_deps_by_target', 'classes_by_source', 'product_deps_by_src']

def _create_products_if_should_run(self):
"""If this task should run, initialize empty products that it will populate.
Expand All @@ -47,106 +49,126 @@ def _create_products_if_should_run(self):
"""

should_run = False
if self.context.products.is_required_data('classes_by_source'):
if self.context.products.is_required_data('product_deps_by_target'):
should_run = True
make_products = lambda: defaultdict(MultipleRootedProducts)
self.context.products.safe_create_data('classes_by_source', make_products)

if self.context.products.is_required_data('product_deps_by_src'):
should_run = True
self.context.products.safe_create_data('product_deps_by_src', dict)
self.context.products.safe_create_data('product_deps_by_target', dict)
return should_run

@memoized_property
def _zinc(self):
    """Memoized Zinc instance built from the global Zinc.Factory."""
    factory = Zinc.Factory.global_instance()
    return factory.create(self.context.products, self.get_options().execution_strategy)

def _summary_json_file(self, vt):
return os.path.join(vt.results_dir, 'summary.json')
def _jdeps_output_json(self, vt):
return os.path.join(vt.results_dir, 'jdeps_output.json')

@memoized_property
def _analysis_by_runtime_entry(self):
zinc_analysis = self.context.products.get_data('zinc_analysis')
return {cp_entry: analysis_file for _, cp_entry, analysis_file in zinc_analysis.values()}
@contextmanager
def aliased_classpaths(self, classpaths):
    """Create unique names for each classpath entry as symlinks in
    a temporary directory. returns: dict[str -> classpath entry]
    which maps string paths of symlinks to classpaths.

    ClasspathEntries generally point to a .jar of
    the .class files generated for java_library targets.
    These jars all have the same basename, z.jar, which
    confuses the `jdeps` tool. Jdeps expects unique, and
    descriptive, basenames for jars. When all basenames are
    the same the deps collide in the jdeps output, some
    .class files can't be found and the summary output
    is not complete.
    """
    with temporary_dir() as tempdir:
        aliases = {}
        for i, cp in enumerate(classpaths):
            # Directories keep a bare numeric alias; files get a `.jar` suffix
            # so jdeps treats them as jars.
            alias = os.path.join(tempdir, f"{i}.jar" if not os.path.isdir(cp) else f"{i}")
            os.symlink(cp, alias)
            aliases[alias] = cp
        yield aliases

def execute(self):
    """Run jdeps over each invalid target and register per-target dep products."""
    deprecated_conditional(
        lambda: self.context.products.is_required_data('classes_by_source'),
        '1.20.0.dev2',
        'The `classes_by_source` product depends on internal compiler details and is no longer produced.'
    )
    deprecated_conditional(
        lambda: self.context.products.is_required_data('product_deps_by_src'),
        '1.20.0.dev2',
        'The `product_deps_by_src` product depends on internal compiler details and is no longer produced. '
        'For similar functionality consume `product_deps_by_target`.'
    )

    # If none of our computed products are necessary, return immediately.
    if not self._create_products_if_should_run():
        return

    classpath_product = self.context.products.get_data('runtime_classpath')
    product_deps_by_target = self.context.products.get_data('product_deps_by_target')

    fingerprint_strategy = DependencyContext.global_instance().create_fingerprint_strategy(
        classpath_product)

    # classpath fingerprint strategy only works on targets with a classpath.
    targets = [target for target in self.context.targets() if hasattr(target, 'strict_deps')]
    with self.invalidated(targets,
                          fingerprint_strategy=fingerprint_strategy,
                          invalidate_dependents=True) as invalidation_check:
        for vt in invalidation_check.all_vts:
            # A list of class paths to the artifacts created by the target we are computing deps for.
            target_artifact_classpaths = [path for _, path in classpath_product.get_for_target(vt.target)]
            potential_deps_classpaths = self._zinc.compile_classpath('runtime_classpath', vt.target)

            jdeps_output_json = self._jdeps_output_json(vt)
            if not vt.valid:
                # Only invalid targets need a fresh jdeps run; valid ones reuse
                # the cached jdeps_output.json in their results dir.
                self._run_jdeps_analysis(
                    vt.target,
                    target_artifact_classpaths,
                    potential_deps_classpaths,
                    jdeps_output_json
                )
            self._register_products(vt.target,
                                    jdeps_output_json,
                                    product_deps_by_target)

@memoized_property
def _jdeps_summary_line_regex(self):
    """Compiled pattern for a jdeps `-summary` line of the form `<source> -> <dep>`."""
    summary_pattern = r"^.+\s->\s(.+)$"
    return re.compile(summary_pattern)

def _run_jdeps_analysis(self, target, target_artifact_classpaths, potential_deps_classpaths, jdeps_output_json):
    """Run jdeps against the target's artifacts and cache the used deps as JSON.

    :param target: the target whose artifacts are analyzed.
    :param target_artifact_classpaths: classpath entries produced *by* this target.
    :param potential_deps_classpaths: classpath entries this target may depend on;
      these are symlink-aliased (see `aliased_classpaths`) so jdeps sees unique
      basenames.
    :param jdeps_output_json: path where the JSON list of used classpath entries
      is written (written even when there are no artifacts, as an empty list).
    """
    with self.aliased_classpaths(potential_deps_classpaths) as classpaths_by_alias:
        with open(jdeps_output_json, 'w') as f:
            if len(target_artifact_classpaths):
                jdeps_stdout, jdeps_stderr = self._spawn_jdeps_command(
                    target, target_artifact_classpaths, classpaths_by_alias.keys()
                ).communicate()
                deps_classpaths = set()
                for line in io.StringIO(jdeps_stdout.decode('utf-8')):
                    match = self._jdeps_summary_line_regex.fullmatch(line.strip())
                    if match is None:
                        # jdeps also emits lines that are not `a -> b` summary
                        # lines (e.g. "not found" notes); skip them instead of
                        # crashing on a None match.
                        continue
                    dep = match.group(1)
                    # Map the symlink alias back to the real classpath entry;
                    # fall back to the raw value for non-aliased deps (e.g. JDK).
                    deps_classpaths.add(classpaths_by_alias.get(dep, dep))
            else:
                deps_classpaths = []
            json.dump(list(deps_classpaths), f)

def _spawn_jdeps_command(self, target, target_artifact_classpaths, potential_deps_classpaths):
    """Spawn `jdeps -summary` over the target's artifacts; returns the subprocess.

    Stdout is piped so the caller can parse the summary lines; jdeps' Main is
    loaded from the JDK's tools.jar.
    """
    jdk = DistributionLocator.cached(jdk=True)
    tool_classpath = jdk.find_libs(['tools.jar'])

    args = [
        "-summary",
        # str.join takes any iterable directly; no generator needed.
        '-classpath', ":".join(potential_deps_classpaths),
    ] + target_artifact_classpaths

    java_executor = SubprocessExecutor(jdk)
    return java_executor.spawn(classpath=tool_classpath,
                               main='com.sun.tools.jdeps.Main',
                               jvm_options=self.get_options().jvm_options,
                               args=args,
                               stdout=subprocess.PIPE)

def _register_products(self,
target,
target_cp_entry,
summary_json_file,
classes_by_source,
product_deps_by_src):
summary_json = self._parse_summary_json(summary_json_file)

# Register a mapping between sources and classfiles (if requested).
if classes_by_source is not None:
buildroot = get_buildroot()
for abs_src, classes in summary_json['products'].items():
source = os.path.relpath(os.path.normpath(abs_src), buildroot)
classes = [os.path.normpath(c) for c in classes]
classes_by_source[source].add_abs_paths(target_cp_entry, classes)

# Register classfile product dependencies (if requested).
if product_deps_by_src is not None:
product_deps_by_src[target] = {
os.path.normpath(src): [os.path.normpath(dep) for dep in deps]
for src, deps in summary_json['dependencies'].items()
}

def _parse_summary_json(self, summary_json_file):
with open(summary_json_file, 'r') as f:
return json.load(f, encoding='utf-8')
jdeps_output_json,
product_deps_by_target):
with open(jdeps_output_json) as f:
product_deps_by_target[target] = json.load(f)
56 changes: 31 additions & 25 deletions src/python/pants/backend/jvm/tasks/jvm_dependency_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,8 @@ def _skip(options):
def prepare(cls, options, round_manager):
super().prepare(options, round_manager)
if not cls._skip(options):
round_manager.require_data('product_deps_by_src')
round_manager.require_data('product_deps_by_target')
round_manager.require_data('runtime_classpath')
round_manager.require_data('zinc_analysis')

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -102,15 +101,15 @@ def execute(self):
fingerprint_strategy = DependencyContext.global_instance().create_fingerprint_strategy(
classpath_product)

targets = list(self.context.products.get_data('zinc_analysis').keys())
targets = [target for target in self.context.targets() if hasattr(target, 'strict_deps')]

with self.invalidated(targets,
fingerprint_strategy=fingerprint_strategy,
invalidate_dependents=True) as invalidation_check:
for vt in invalidation_check.invalid_vts:
product_deps_by_src = self.context.products.get_data('product_deps_by_src').get(vt.target)
if product_deps_by_src is not None:
self.check(vt.target, product_deps_by_src)
product_deps_for_target = self.context.products.get_data('product_deps_by_target').get(vt.target)
if product_deps_for_target is not None:
self.check(vt.target, product_deps_for_target)

def check(self, src_tgt, actual_deps):
"""Check for missing deps.
Expand All @@ -130,16 +129,19 @@ def shorten(path): # Make the output easier to read.

def filter_whitelisted(missing_deps):
# Removing any targets that exist in the whitelist from the list of dependency issues.
return [(tgt_pair, evidence) for (tgt_pair, evidence) in missing_deps
if tgt_pair[0].address not in self._target_whitelist]
return [
(tgt_pair, evidence)
for (tgt_pair, evidence) in missing_deps
if tgt_pair[0].address not in self._target_whitelist
]

missing_direct_tgt_deps = filter_whitelisted(missing_direct_tgt_deps)

if self._check_missing_direct_deps and missing_direct_tgt_deps:
log_fn = (self.context.log.error if self._check_missing_direct_deps == 'fatal'
else self.context.log.warn)
for (tgt_pair, evidence) in missing_direct_tgt_deps:
evidence_str = '\n'.join([' {} uses {}'.format(shorten(e[0]), shorten(e[1]))
evidence_str = '\n'.join([' {} uses {}'.format(e[0].address.spec, shorten(e[1]))
for e in evidence])
log_fn('Missing direct BUILD dependency {} -> {} because:\n{}'
.format(tgt_pair[0].address.spec, tgt_pair[1].address.spec, evidence_str))
Expand Down Expand Up @@ -180,6 +182,7 @@ def _compute_missing_deps(self, src_tgt, actual_deps):
All paths in the input and output are absolute.
"""
analyzer = self._analyzer

def must_be_explicit_dep(dep):
# We don't require explicit deps on the java runtime, so we shouldn't consider that
# a missing dep.
Expand All @@ -204,22 +207,23 @@ def target_or_java_dep_in_targets(target, targets):
missing_direct_tgt_deps_map = defaultdict(list) # The same, but for direct deps.

targets_by_file = analyzer.targets_by_file(self.context.targets())
buildroot = get_buildroot()
abs_srcs = [os.path.join(buildroot, src) for src in src_tgt.sources_relative_to_buildroot()]
for src in abs_srcs:
for actual_dep in filter(must_be_explicit_dep, actual_deps.get(src, [])):
actual_dep_tgts = targets_by_file.get(actual_dep)
# actual_dep_tgts is usually a singleton. If it's not, we only need one of these
# to be in our declared deps to be OK.
if actual_dep_tgts is None:
missing_file_deps.add((src_tgt, actual_dep))
elif not target_or_java_dep_in_targets(src_tgt, actual_dep_tgts):
# Obviously intra-target deps are fine.
canonical_actual_dep_tgt = next(iter(actual_dep_tgts))
if canonical_actual_dep_tgt not in src_tgt.dependencies:
# The canonical dep is the only one a direct dependency makes sense on.
missing_direct_tgt_deps_map[(src_tgt, canonical_actual_dep_tgt)].append(
(src, actual_dep))
for actual_dep in filter(must_be_explicit_dep, actual_deps):
actual_dep_tgts = targets_by_file.get(actual_dep)
# actual_dep_tgts is usually a singleton. If it's not, we only need one of these
# to be in our declared deps to be OK.
if actual_dep_tgts is None:
missing_file_deps.add((src_tgt, actual_dep))
elif not target_or_java_dep_in_targets(src_tgt, actual_dep_tgts):
# Obviously intra-target deps are fine.
canonical_actual_dep_tgt = next(iter(actual_dep_tgts))
if canonical_actual_dep_tgt not in src_tgt.dependencies:
# The canonical dep is the only one a direct dependency makes sense on.
# TODO get rid of src usage here. we dont have a way to map class
# files back to source files when using jdeps. I think we can get away without
# listing the src file directly and just list the target which has the transient
# dep
missing_direct_tgt_deps_map[(src_tgt, canonical_actual_dep_tgt)].append(
(src_tgt, actual_dep))

return (list(missing_file_deps),
list(missing_direct_tgt_deps_map.items()))
Expand Down Expand Up @@ -254,6 +258,8 @@ def _compute_unnecessary_deps(self, target, actual_deps):
"""
# Flatten the product deps of this target.
product_deps = set()
# TODO update actual deps will just be a list, not a dict when switching to
# product_deps_by_target_product.
for dep_entries in actual_deps.values():
product_deps.update(dep_entries)

Expand Down
Loading