Skip to content

Commit

Permalink
use libCST to parse imports from python 3 code (#10907)
Browse files Browse the repository at this point in the history
### Problem

Fixes #10894.

### Solution

- Add `libcst>=0.3.12,<0.4` to `requirements.txt`.
- Remove the usage of the normal python 3 `ast` library from `import_parser.py`, and implement a `_CSTVisitor` to visit import nodes.
- When using python 3, instead use the `_CSTVisitor`.
  - We use python 3.8 to parse all python 3 code, which is 100% correct for the purpose of parsing imports.

### Result

We should no longer be dependent on the version of the current python executable to parse python code for dependencies!
  • Loading branch information
cosmicexplorer committed Oct 8, 2020
1 parent ff02ef0 commit ea542ac
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 64 deletions.
10 changes: 6 additions & 4 deletions 3rdparty/python/constraints.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by build-support/bin/generate_lockfile.sh on Mon Sep 28 09:49:00 PDT 2020
# Generated by build-support/bin/generate_lockfile.sh on Wed Oct 7 12:00:29 PDT 2020
ansicolors==1.1.8
attrs==20.2.0
beautifulsoup4==4.6.3
Expand All @@ -11,14 +11,15 @@ fasteners==0.15
idna==2.10
importlib-metadata==2.0.0
iniconfig==1.0.1
libcst==0.3.12
monotonic==1.5
more-itertools==8.5.0
mypy==0.782
mypy-extensions==0.4.3
packaging==20.4
pathspec==0.8.0
pex==2.1.16
pip==19.0.3
pex==2.1.17
pip==18.1
pluggy==0.13.1
psutil==5.7.0
py==1.9.0
Expand All @@ -35,6 +36,7 @@ six==1.15.0
toml==0.10.1
typed-ast==1.4.1
typing-extensions==3.7.4.2
typing-inspect==0.6.0
urllib3==1.25.10
www-authenticate==0.9.2
zipp==3.2.0
zipp==3.3.0
1 change: 1 addition & 0 deletions 3rdparty/python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ ansicolors==1.1.8
beautifulsoup4>=4.6.0,<4.7
dataclasses==0.6
fasteners==0.15.0
libcst>=0.3.12,<0.4

# The MyPy requirement should be maintained in lockstep with the requirement the Pants repo uses
# for the mypy task since it configures custom MyPy plugins. That requirement can be found via:
Expand Down
153 changes: 101 additions & 52 deletions src/python/pants/backend/python/dependency_inference/import_parser.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
# Copyright 2020 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

import ast as ast3
import logging
import re
import sys
from dataclasses import dataclass
from typing import Optional, Set, Tuple
from typing import Optional, Set, Union

import libcst as cst
from typed_ast import ast27
from typing_extensions import Protocol

from pants.util.memo import memoized_property
from pants.util.ordered_set import FrozenOrderedSet
from pants.util.strutil import ensure_text


class ImportParseError(ValueError):
pass
logger = logging.getLogger(__name__)


@dataclass(frozen=True)
Expand All @@ -34,41 +33,56 @@ class ParsedPythonImports:
def all_imports(self) -> FrozenOrderedSet[str]:
return FrozenOrderedSet(sorted([*self.explicit_imports, *self.inferred_imports]))

@classmethod
def empty(cls) -> "ParsedPythonImports":
return cls(FrozenOrderedSet(), FrozenOrderedSet())


class VisitorInterface(Protocol):
explicit_imports: Set[str]
inferred_imports: Set[str]

def parse_file(*, filename: str, content: str) -> Optional[Tuple]:

def parse_file(*, filename: str, content: str, module_name: str) -> Optional[VisitorInterface]:
"""Parse the file for python imports, and return a visitor with the imports it found."""
# Parse all python 3 code with libCST. We parse assuming python 3 goes first, because we assume
# most user code will be python 3.
# TODO(#10921): identify the appropriate interpreter version beforehand!
try:
# NB: The Python 3 ast is generally backwards-compatible with earlier versions. The only
# breaking change is `async` `await` becoming reserved keywords in Python 3.7 (deprecated
# in 3.6). If the std-lib fails to parse, we could use typed-ast to try parsing with a
# target version of Python 3.5, but we don't because Python 3.5 is almost EOL and has very
# low usage.
# We will also fail to parse Python 3.8 syntax if Pants is run with Python 3.6 or 3.7.
# There is no known workaround for this, beyond users changing their `./pants` script to
# always use >= 3.8.
tree = ast3.parse(content, filename=filename)
visitor_cls = _Py3AstVisitor if sys.version_info[:2] < (3, 8) else _Py38AstVisitor
return tree, visitor_cls
except SyntaxError:
try:
py27_tree = ast27.parse(content, filename=filename)
return py27_tree, _Py27AstVisitor
except SyntaxError:
return None
# NB: Since all python 3 code is forwards-compatible with the 3.8 parser, and the import
# syntax remains unchanged, we are safely able to use the 3.8 parser for parsing imports.
# TODO(#10922): Support parsing python 3.9/3.10 with libCST!
config = cst.PartialParserConfig(python_version="3.8")
cst_tree = cst.parse_module(content, config=config)
completed_cst_visitor = _CSTVisitor.visit_tree(cst_tree, module_name=module_name)
return completed_cst_visitor
except cst.ParserSyntaxError as e:
# NB: When the python 3 ast visitor fails to parse python 2 syntax, it raises a
# ParserSyntaxError. This may also occur when the file contains invalid python code. If we
# successfully parse a python 2 file with a python 3 parser, that should not change the
# imports we calculate.
logger.debug(f"Failed to parse {filename} with python 3.8 libCST parser: {e}")

try:
py27_tree = ast27.parse(content, filename=filename)
completed_ast27_visitor = _Py27AstVisitor.visit_tree(py27_tree, module_name=module_name)
return completed_ast27_visitor
except SyntaxError as e:
logger.debug(f"Failed to parse {filename} with python 2.7 typed-ast parser: {e}")

return None


def find_python_imports(*, filename: str, content: str, module_name: str) -> ParsedPythonImports:
parse_result = parse_file(filename=filename, content=content)
completed_visitor = parse_file(filename=filename, content=content, module_name=module_name)
# If there were syntax errors, gracefully early return. This is more user friendly than
# propagating the exception. Dependency inference simply won't be used for that file, and
# it'll be up to the tool actually being run (e.g. Pytest or Flake8) to error.
if parse_result is None:
return ParsedPythonImports(FrozenOrderedSet(), FrozenOrderedSet())
tree, ast_visitor_cls = parse_result
ast_visitor = ast_visitor_cls(module_name)
ast_visitor.visit(tree)
if completed_visitor is None:
return ParsedPythonImports.empty()
return ParsedPythonImports(
explicit_imports=FrozenOrderedSet(sorted(ast_visitor.explicit_imports)),
inferred_imports=FrozenOrderedSet(sorted(ast_visitor.inferred_imports)),
explicit_imports=FrozenOrderedSet(sorted(completed_visitor.explicit_imports)),
inferred_imports=FrozenOrderedSet(sorted(completed_visitor.inferred_imports)),
)


Expand All @@ -77,42 +91,77 @@ def find_python_imports(*, filename: str, content: str, module_name: str) -> Par
_INFERRED_IMPORT_REGEX = re.compile(r"^([a-z_][a-z_\d]*\.){2,}[a-zA-Z_]\w*$")


class _BaseAstVisitor:
class _Py27AstVisitor(ast27.NodeVisitor):
def __init__(self, module_name: str) -> None:
self._module_parts = module_name.split(".")
self.explicit_imports: Set[str] = set()
self.inferred_imports: Set[str] = set()

def maybe_add_inferred_import(self, s: str) -> None:
@classmethod
def visit_tree(cls, tree: ast27.AST, module_name: str) -> "_Py27AstVisitor":
visitor = cls(module_name)
visitor.visit(tree)
return visitor

def _maybe_add_inferred_import(self, s: str) -> None:
if _INFERRED_IMPORT_REGEX.match(s):
self.inferred_imports.add(s)

def visit_Import(self, node) -> None:
def visit_Import(self, node: ast27.Import) -> None:
for alias in node.names:
self.explicit_imports.add(alias.name)

def visit_ImportFrom(self, node) -> None:
rel_module = node.module
abs_module = ".".join(
self._module_parts[0 : -node.level] + ([] if rel_module is None else [rel_module])
)
def visit_ImportFrom(self, node: ast27.ImportFrom) -> None:
rel_module = [] if node.module is None else [node.module]
relative_level = 0 if node.level is None else node.level
abs_module = ".".join(self._module_parts[0:-relative_level] + rel_module)
for alias in node.names:
self.explicit_imports.add(f"{abs_module}.{alias.name}")


class _Py27AstVisitor(ast27.NodeVisitor, _BaseAstVisitor):
def visit_Str(self, node) -> None:
def visit_Str(self, node: ast27.Str) -> None:
val = ensure_text(node.s)
self.maybe_add_inferred_import(val)
self._maybe_add_inferred_import(val)


class _Py3AstVisitor(ast3.NodeVisitor, _BaseAstVisitor):
def visit_Str(self, node) -> None:
self.maybe_add_inferred_import(node.s)
class _CSTVisitor(cst.CSTVisitor):
def __init__(self, module_name: str) -> None:
self._module_parts = module_name.split(".")
self.explicit_imports: Set[str] = set()
self.inferred_imports: Set[str] = set()

@classmethod
def visit_tree(cls, tree: cst.Module, module_name: str) -> "_CSTVisitor":
visitor = cls(module_name)
tree.visit(visitor)
return visitor

class _Py38AstVisitor(ast3.NodeVisitor, _BaseAstVisitor):
# Python 3.8 deprecated the Str node in favor of Constant.
def visit_Constant(self, node) -> None:
if isinstance(node.value, str):
self.maybe_add_inferred_import(node.value)
def _maybe_add_inferred_import(self, s: Union[str, bytes]) -> None:
if isinstance(s, bytes):
return
if _INFERRED_IMPORT_REGEX.match(s):
self.inferred_imports.add(s)

def _flatten_attribute_or_name(self, node: cst.BaseExpression) -> str:
if isinstance(node, cst.Name):
return node.value
if not isinstance(node, cst.Attribute):
raise TypeError(f"Unrecognized cst.BaseExpression subclass: {node}")
inner = self._flatten_attribute_or_name(node.value)
return f"{inner}.{node.attr.value}"

def visit_Import(self, node: cst.Import) -> None:
for alias in node.names:
self.explicit_imports.add(self._flatten_attribute_or_name(alias.name))

def visit_ImportFrom(self, node: cst.ImportFrom) -> None:
rel_module = [] if node.module is None else [self._flatten_attribute_or_name(node.module)]
relative_level = len(node.relative)
abs_module = ".".join(self._module_parts[0:-relative_level] + rel_module)
if isinstance(node.names, cst.ImportStar):
self.explicit_imports.add(f"{abs_module}.*")
else:
for alias in node.names:
self.explicit_imports.add(f"{abs_module}.{alias.name.value}")

def visit_SimpleString(self, node: cst.SimpleString) -> None:
self._maybe_add_inferred_import(node.evaluated_value)
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# Copyright 2020 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

import sys
import logging
from textwrap import dedent

import pytest

from pants.backend.python.dependency_inference.import_parser import find_python_imports
from pants.backend.python.dependency_inference.import_parser import (
ParsedPythonImports,
find_python_imports,
)


def test_normal_imports() -> None:
Expand Down Expand Up @@ -146,10 +147,6 @@ def test_works_with_python2() -> None:
assert set(imports.inferred_imports) == {"dep.from.bytes", "dep.from.str"}


@pytest.mark.skipif(
sys.version_info[:2] < (3, 8),
reason="Cannot parse Python 3.8 unless Pants is run with Python 3.8.",
)
def test_works_with_python38() -> None:
imports = find_python_imports(
filename="foo.py",
Expand All @@ -169,3 +166,39 @@ def test_works_with_python38() -> None:
)
assert set(imports.explicit_imports) == {"demo", "project.demo.Demo"}
assert set(imports.inferred_imports) == {"dep.from.str"}


def test_debug_log_with_python39(caplog) -> None:
"""We can't parse python 3.9 yet, so we want to be able to provide good debug log."""
caplog.set_level(logging.DEBUG)
imports = find_python_imports(
filename="foo.py",
# See https://www.python.org/dev/peps/pep-0614/ for the newly relaxed decorator expressions.
content=dedent(
"""\
@buttons[0].clicked.connect
def spam():
...
"""
),
module_name="project.app",
)
assert imports == ParsedPythonImports.empty()
assert len(caplog.records) == 2
messages = [rec.getMessage() for rec in caplog.records]

cst_parse_error = dedent(
"""\
Failed to parse foo.py with python 3.8 libCST parser: Syntax Error @ 1:9.
Incomplete input. Encountered '[', but expected '(', or 'NEWLINE'.
@buttons[0].clicked.connect
^"""
)
assert cst_parse_error == messages[0]

ast27_parse_error = dedent(
"""\
Failed to parse foo.py with python 2.7 typed-ast parser: invalid syntax (foo.py, line 1)"""
)
assert ast27_parse_error == messages[1]

0 comments on commit ea542ac

Please sign in to comment.