From dffb29fe307c5c8be73dcc5636a14a11e9d33e45 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Fri, 24 Oct 2025 14:29:06 +1000 Subject: [PATCH 1/6] feat: added functionality to temporarily download and unzip a PyPI wheel for analysis, alongside basic analysis to log inferred build backends. Signed-off-by: Abhinav Pradeep --- .../common_spec/pypi_spec.py | 123 +++++++++++++++ src/macaron/repo_finder/repo_finder_pypi.py | 2 +- .../package_registry/pypi_registry.py | 144 +++++++++++++++++- 3 files changed, 267 insertions(+), 2 deletions(-) diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index 94489883c..54b1797e5 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -3,10 +3,19 @@ """This module includes build specification and helper classes for PyPI packages.""" +import logging +import os +import re +import tomli from packageurl import PackageURL from macaron.build_spec_generator.common_spec.base_spec import BaseBuildSpec, BaseBuildSpecDict +from macaron.errors import SourceCodeError +from macaron.slsa_analyzer.package_registry import pypi_registry +from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo + +logger: logging.Logger = logging.getLogger(__name__) class PyPIBuildSpec( @@ -34,3 +43,117 @@ def resolve_fields(self, purl: PackageURL) -> None: purl: str The target software component Package URL. 
""" + if purl.type != "pypi": + return + + registry = pypi_registry.PyPIRegistry() + registry.load_defaults() + + registry_info = PackageRegistryInfo( + build_tool_name="pip", + build_tool_purl_type="pypi", + package_registry=registry, + metadata=[], + ) + + pypi_package_json = pypi_registry.find_or_create_pypi_asset(purl.name, purl.version, registry_info) + + if pypi_package_json is not None: + if pypi_package_json.package_json or pypi_package_json.download(dest=""): + build_backends: dict[str, str] = {} + with pypi_package_json.wheel(): + logger.debug("Wheel at %s", pypi_package_json.wheel_path) + # Should only have .dist-info directory + logger.debug("It has directories %s", ",".join(os.listdir(pypi_package_json.wheel_path))) + # Make build-req array + wheel_contents, metadata_contents = self.read_directory(pypi_package_json.wheel_path, purl) + generator, version = self.read_generator_line(wheel_contents) + if generator != "": + build_backends[generator] = version + if generator != "setuptools": + # Apply METADATA heuristics to determine setuptools version + if "License-File" in metadata_contents: + build_backends["setuptools"] = "56.2.0" + elif "Platform: UNKNOWN" in metadata_contents: + build_backends["setuptools"] = "57.5.0" + else: + build_backends["setuptools"] = "67.7.2" + + with pypi_package_json.sourcecode(): + try: + pyproject_content = pypi_package_json.get_sourcecode_file_contents("pyproject.toml") + content = tomli.loads(pyproject_content.decode("utf-8")) + build_system: dict[str, list[str]] = content.get("build-system", {}) + requires_array: list[str] = build_system.get("requires", []) + logger.debug("From pyproject.toml:") + logger.debug(requires_array) + except SourceCodeError: + logger.debug("No pyproject.toml") + + logger.debug("From .dist_info:") + logger.debug(build_backends) + + def read_directory(self, wheel_path: str, purl: PackageURL) -> tuple[str, str]: + """ + Read in the WHEEL and METADATA file from the .dist_info directory. 
+ + Parameters + ---------- + wheel_path : str + Path to the temporary directory where the wheel was + downloaded into. + purl: PackageURL + PURL corresponding to the package being analyzed. + + Returns + ------- + tuple[str, str] + Tuple where the first element is a string of the .dist-info/WHEEL + contents and the second element is a string of the .dist-info/METADATA + contents + """ + # From https://peps.python.org/pep-0427/#escaping-and-unicode + normalized_name = re.sub(r"[^\w\d.]+", "_", purl.name, re.UNICODE) + dist_info = f"{normalized_name}-{purl.version}.dist-info" + logger.debug(dist_info) + + dist_info_path = os.path.join(wheel_path, dist_info) + + if not os.path.isdir(dist_info_path): + return "", "" + + wheel_path = os.path.join(dist_info_path, "WHEEL") + metadata_path = os.path.join(dist_info_path, "METADATA") + + wheel_contents = "" + metadata_contents = "" + + if os.path.exists(wheel_path): + with open(wheel_path, encoding="utf-8") as wheel_file: + wheel_contents = wheel_file.read() + if os.path.exists(metadata_path): + with open(metadata_path, encoding="utf-8") as metadata_file: + metadata_contents = metadata_file.read() + + return wheel_contents, metadata_contents + + def read_generator_line(self, wheel_contents: str) -> tuple[str, str]: + """ + Parse through the "Generator: {build backend} {version}" line of .dist_info/WHEEL. + + Parameters + ---------- + wheel_contents : str + String of the contents of the .dist_info/WHEEL file + + Returns + ------- + tuple[str, str] + Tuple where the first element is the generating build backend and + the second element is its version. 
+ """ + for line in wheel_contents.splitlines(): + if line.startswith("Generator:"): + split_line = line.split(" ") + return split_line[1], split_line[2] + return "", "" diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index c0c273154..c064a80c1 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -58,7 +58,7 @@ def find_repo( pypi_registry = next((registry for registry in PACKAGE_REGISTRIES if isinstance(registry, PyPIRegistry)), None) if not pypi_registry: return "", RepoFinderInfo.PYPI_NO_REGISTRY - pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "") + pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "", "") if not pypi_asset: # This should be unreachable, as the pypi_registry has already been confirmed to be of type PyPIRegistry. diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index becf815de..33e41c7ec 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -12,6 +12,7 @@ import tarfile import tempfile import urllib.parse +import zipfile from collections.abc import Callable, Generator, Iterator from contextlib import contextmanager from dataclasses import dataclass @@ -283,6 +284,66 @@ def download_package_sourcecode(self, url: str) -> str: logger.debug("Temporary download and unzip of %s stored in %s", file_name, temp_dir) return temp_dir + def download_package_wheel(self, url: str) -> str: + """Download the wheel at input url. + + Parameters + ---------- + url: str + The wheel's url. + + Returns + ------- + str + The temp directory storing {distribution}-{version}.dist-info/WHEEL and + {distribution}-{version}.dist-info/METADATA. 
+ + Raises + ------ + InvalidHTTPResponseError + If the HTTP request to the registry fails or an unexpected response is returned. + """ + # Get name of file. + _, _, file_name = url.rpartition("/") + # Remove the .whl to get wheel name + wheel_name = re.sub(r"\.whl$", "", file_name) + # Makes a directory in the OS's temp folder + temp_dir = tempfile.mkdtemp(prefix=f"{wheel_name}_") + # get temp_dir/file_name + wheel_file = os.path.join(temp_dir, file_name) + # Same timeout and size limit as in download_package_sourcecode + timeout = defaults.getint("downloads", "timeout", fallback=120) + size_limit = defaults.getint("downloads", "max_download_size", fallback=10000000) + + if not download_file_with_size_limit(url, {}, wheel_file, timeout, size_limit): + self.cleanup_sourcecode_directory(temp_dir, "Could not download the file.") + + # Wheel is a zip + if not zipfile.is_zipfile(wheel_file): + self.cleanup_sourcecode_directory(temp_dir, f"Unable to extract source code from file {file_name}") + + try: + # For consumer pattern + with zipfile.ZipFile(wheel_file) as zip_file: + members = [] + for member in zip_file.infolist(): + if member.filename.endswith("WHEEL"): + members.append(member) + if member.filename.endswith("METADATA"): + members.append(member) + zip_file.extractall(temp_dir, members) # nosec B202:tarfile_unsafe_members + except zipfile.BadZipFile as bad_zip: + self.cleanup_sourcecode_directory(temp_dir, f"Error extracting wheel: {bad_zip}", bad_zip) + + # Now we should have it like: + # temp_dir/wheel_name.whl + # temp_dir/wheel_name.dist-info/ + + os.remove(wheel_file) + + logger.debug("Temporary download and unzip of %s stored in %s", file_name, temp_dir) + return temp_dir + def get_artifact_hash(self, artifact_url: str) -> str | None: """Return the hash of the artifact found at the passed URL. 
@@ -496,6 +557,9 @@ class PyPIPackageJsonAsset: #: the source code temporary location name package_sourcecode_path: str + #: the wheel temporary location name + wheel_path: str + #: The size of the asset (in bytes). This attribute is added to match the AssetLocator #: protocol and is not used because pypi API registry does not provide it. @property @@ -615,6 +679,57 @@ def get_sourcecode_url(self, package_type: str = "sdist") -> str | None: return configured_source_url return None + def get_wheel_url(self, tag: str = "none-any") -> str | None: + """Get url of wheel corresponding to specified tag. + + Parameters + ---------- + tag: str + Wheel tag to match. Defaults to none-any. + + Returns + ------- + str | None + URL of the wheel. + """ + if self.component_version: + urls = json_extract(self.package_json, ["releases", self.component_version], list) + else: + # Get the latest version. + urls = json_extract(self.package_json, ["urls"], list) + if not urls: + return None + for distribution in urls: + # Only examine wheels + if distribution.get("packagetype") != "bdist_wheel": + continue + file_name: str = distribution.get("filename") or "" + # Ensure wheel matches tag + # IS CURRENTLY A BIT NAIVE, BUT ALSO INTENTIONAL: + # Do we want to search for "tag" in the wheel name, or force an exact match for the tag? 
+ if not file_name.endswith(f"{tag}.whl"): + continue + # Continue getting url as get_sourcecode_url does + wheel_url: str = distribution.get("url") or "" + if wheel_url: + try: + parsed_url = urllib.parse.urlparse(wheel_url) + except ValueError: + logger.debug("Error occurred while processing the wheel URL %s.", wheel_url) + return None + if self.pypi_registry.fileserver_url_netloc and self.pypi_registry.fileserver_url_scheme: + configured_wheel_url = urllib.parse.ParseResult( + scheme=self.pypi_registry.fileserver_url_scheme, + netloc=self.pypi_registry.fileserver_url_netloc, + path=parsed_url.path, + params="", + query="", + fragment="", + ).geturl() + logger.debug("Found wheel URL: %s", configured_wheel_url) + return configured_wheel_url + return None + def get_latest_release_upload_time(self) -> str | None: """Get upload time of the latest release. @@ -629,6 +744,33 @@ def get_latest_release_upload_time(self) -> str | None: return upload_time return None + @contextmanager + def wheel(self) -> Generator[None]: + """Download and cleanup wheel of the package with a context manager.""" + if not self.download_wheel(): + raise SourceCodeError("Unable to download requested wheel.") + yield + if self.wheel_path: + # Name for cleanup_sourcecode_directory could be refactored here + PyPIRegistry.cleanup_sourcecode_directory(self.wheel_path) + + def download_wheel(self) -> bool: + """Download and extract wheel metadata to a temporary directory. + + Returns + ------- + bool + ``True`` if the wheel is downloaded and extracted successfully; ``False`` if not. 
+ """ + url = self.get_wheel_url() + if url: + try: + self.wheel_path = self.pypi_registry.download_package_wheel(url) + return True + except InvalidHTTPResponseError as error: + logger.debug(error) + return False + @contextmanager def sourcecode(self) -> Generator[None]: """Download and cleanup source code of the package with a context manager.""" @@ -799,6 +941,6 @@ def find_or_create_pypi_asset( logger.debug("Failed to create PyPIPackageJson asset.") return None - asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {}, "") + asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {}, "", "") pypi_registry_info.metadata.append(asset) return asset From 238581dfcfff1ca39392427f82add7dc48f4a862 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Mon, 27 Oct 2025 15:53:53 +1000 Subject: [PATCH 2/6] feat: merge in build backend data from pyproject.toml and .dist_info files, which now populates a build_backends field of BaseBuildSpecDict. Signed-off-by: Abhinav Pradeep --- .../common_spec/base_spec.py | 5 ++ .../common_spec/pypi_spec.py | 55 ++++++++++++++----- src/macaron/config/defaults.ini | 6 ++ 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/src/macaron/build_spec_generator/common_spec/base_spec.py b/src/macaron/build_spec_generator/common_spec/base_spec.py index 9fe4e0e0d..18c1b334a 100644 --- a/src/macaron/build_spec_generator/common_spec/base_spec.py +++ b/src/macaron/build_spec_generator/common_spec/base_spec.py @@ -73,6 +73,11 @@ class BaseBuildSpecDict(TypedDict, total=False): #: Entry point script, class, or binary for running the project. entry_point: NotRequired[str | None] + # A "back end" is tool that a "front end" (such as pip/build) would call to + # package the source distribution into the wheel format. build_backends would + # be a list of these that were used in building the wheel alongside their version. 
+ build_backends: NotRequired[dict[str, str]] + class BaseBuildSpec(ABC): """Abstract base class for build specification behavior and field resolution.""" diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index 54b1797e5..478d91560 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -9,8 +9,10 @@ import tomli from packageurl import PackageURL +from packaging.requirements import InvalidRequirement, Requirement from macaron.build_spec_generator.common_spec.base_spec import BaseBuildSpec, BaseBuildSpecDict +from macaron.config.defaults import defaults from macaron.errors import SourceCodeError from macaron.slsa_analyzer.package_registry import pypi_registry from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo @@ -60,6 +62,7 @@ def resolve_fields(self, purl: PackageURL) -> None: if pypi_package_json is not None: if pypi_package_json.package_json or pypi_package_json.download(dest=""): + requires_array: list[str] = [] build_backends: dict[str, str] = {} with pypi_package_json.wheel(): logger.debug("Wheel at %s", pypi_package_json.wheel_path) @@ -69,30 +72,52 @@ def resolve_fields(self, purl: PackageURL) -> None: wheel_contents, metadata_contents = self.read_directory(pypi_package_json.wheel_path, purl) generator, version = self.read_generator_line(wheel_contents) if generator != "": - build_backends[generator] = version + build_backends[generator] = "==" + version if generator != "setuptools": # Apply METADATA heuristics to determine setuptools version if "License-File" in metadata_contents: - build_backends["setuptools"] = "56.2.0" + build_backends["setuptools"] = "==" + defaults.get( + "heuristic.pypi", "setuptools_version_emitting_license" + ) elif "Platform: UNKNOWN" in metadata_contents: - build_backends["setuptools"] = "57.5.0" + build_backends["setuptools"] = "==" + 
defaults.get( + "heuristic.pypi", "setuptools_version_emitting_platform_unknown" + ) else: - build_backends["setuptools"] = "67.7.2" - - with pypi_package_json.sourcecode(): - try: - pyproject_content = pypi_package_json.get_sourcecode_file_contents("pyproject.toml") - content = tomli.loads(pyproject_content.decode("utf-8")) - build_system: dict[str, list[str]] = content.get("build-system", {}) - requires_array: list[str] = build_system.get("requires", []) - logger.debug("From pyproject.toml:") - logger.debug(requires_array) - except SourceCodeError: - logger.debug("No pyproject.toml") + build_backends["setuptools"] = "==" + defaults.get("heuristic.pypi", "default_setuptools") logger.debug("From .dist_info:") logger.debug(build_backends) + try: + with pypi_package_json.sourcecode(): + try: + pyproject_content = pypi_package_json.get_sourcecode_file_contents("pyproject.toml") + content = tomli.loads(pyproject_content.decode("utf-8")) + build_system: dict[str, list[str]] = content.get("build-system", {}) + requires_array = build_system.get("requires", []) + logger.debug("From pyproject.toml:") + logger.debug(requires_array) + except SourceCodeError: + logger.debug("No pyproject.toml") + except SourceCodeError: + logger.debug("No pyproject.toml") + + # Merge in pyproject.toml information only when the wheel dist_info does not contain the same + # Hatch is an interesting example of this merge being required. 
+ for requirement in requires_array: + try: + parsed_requirement = Requirement(requirement) + if parsed_requirement.name not in build_backends: + build_backends[parsed_requirement.name] = str(parsed_requirement.specifier) + except InvalidRequirement: + logger.debug("Malformed requirement encountered:") + logger.debug(requirement) + + logger.debug("Combined:") + logger.debug(build_backends) + self.data["build_backends"] = build_backends + def read_directory(self, wheel_path: str, purl: PackageURL) -> tuple[str, str]: """ Read in the WHEEL and METADATA file from the .dist_info directory. diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 892da7b50..8f4354e3c 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -644,3 +644,9 @@ custom_semgrep_rules_path = # .yaml prefix. Note, this will be ignored if a path to custom semgrep rules is not provided. This list may not contain # duplicated elements, meaning that ruleset names must be unique. 
disabled_custom_rulesets = +# As per https://peps.python.org/pep-0639/appendix-examples/, presumably most versions < 59.1.1 will work here +setuptools_version_emitting_license = 56.2.0 + +setuptools_version_emitting_platform_unknown = 57.5.0 + +default_setuptools = 67.7.2 From 9a6b21a452866c72beda2602a3d4d28113e90c45 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Mon, 27 Oct 2025 16:57:12 +1000 Subject: [PATCH 3/6] fix: better logging and error handling added for when no pure wheel exists for the PURL Signed-off-by: Abhinav Pradeep --- .../common_spec/pypi_spec.py | 51 ++++++++++--------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index 478d91560..f341a1d9e 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -64,27 +64,32 @@ def resolve_fields(self, purl: PackageURL) -> None: if pypi_package_json.package_json or pypi_package_json.download(dest=""): requires_array: list[str] = [] build_backends: dict[str, str] = {} - with pypi_package_json.wheel(): - logger.debug("Wheel at %s", pypi_package_json.wheel_path) - # Should only have .dist-info directory - logger.debug("It has directories %s", ",".join(os.listdir(pypi_package_json.wheel_path))) - # Make build-req array - wheel_contents, metadata_contents = self.read_directory(pypi_package_json.wheel_path, purl) - generator, version = self.read_generator_line(wheel_contents) - if generator != "": - build_backends[generator] = "==" + version - if generator != "setuptools": - # Apply METADATA heuristics to determine setuptools version - if "License-File" in metadata_contents: - build_backends["setuptools"] = "==" + defaults.get( - "heuristic.pypi", "setuptools_version_emitting_license" - ) - elif "Platform: UNKNOWN" in metadata_contents: - build_backends["setuptools"] = "==" + defaults.get( - 
"heuristic.pypi", "setuptools_version_emitting_platform_unknown" - ) - else: - build_backends["setuptools"] = "==" + defaults.get("heuristic.pypi", "default_setuptools") + try: + with pypi_package_json.wheel(): + logger.debug("Wheel at %s", pypi_package_json.wheel_path) + # Should only have .dist-info directory + logger.debug("It has directories %s", ",".join(os.listdir(pypi_package_json.wheel_path))) + # Make build-req array + wheel_contents, metadata_contents = self.read_directory(pypi_package_json.wheel_path, purl) + generator, version = self.read_generator_line(wheel_contents) + if generator != "": + build_backends[generator] = "==" + version + if generator != "setuptools": + # Apply METADATA heuristics to determine setuptools version + if "License-File" in metadata_contents: + build_backends["setuptools"] = "==" + defaults.get( + "heuristic.pypi", "setuptools_version_emitting_license" + ) + elif "Platform: UNKNOWN" in metadata_contents: + build_backends["setuptools"] = "==" + defaults.get( + "heuristic.pypi", "setuptools_version_emitting_platform_unknown" + ) + else: + build_backends["setuptools"] = "==" + defaults.get( + "heuristic.pypi", "default_setuptools" + ) + except SourceCodeError: + logger.debug("Could not find pure wheel matching this PURL") logger.debug("From .dist_info:") logger.debug(build_backends) @@ -99,9 +104,9 @@ def resolve_fields(self, purl: PackageURL) -> None: logger.debug("From pyproject.toml:") logger.debug(requires_array) except SourceCodeError: - logger.debug("No pyproject.toml") + logger.debug("No pyproject.toml found") except SourceCodeError: - logger.debug("No pyproject.toml") + logger.debug("No source distribution found") # Merge in pyproject.toml information only when the wheel dist_info does not contain the same # Hatch is an interesting example of this merge being required. 
From 5148c8d2b1df8c0aa68e359d210917a316696050 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Thu, 30 Oct 2025 15:46:03 +1000 Subject: [PATCH 4/6] test: modified and added integration tests for python buildspec generation Signed-off-by: Abhinav Pradeep --- .../expected_default.buildspec | 1 + .../cases/pypi_cachetools/test.yaml | 32 +++++++++++++++++++ .../pypi_toga/expected_default.buildspec | 2 +- 3 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 tests/integration/cases/pypi_cachetools/expected_default.buildspec create mode 100644 tests/integration/cases/pypi_cachetools/test.yaml diff --git a/tests/integration/cases/pypi_cachetools/expected_default.buildspec b/tests/integration/cases/pypi_cachetools/expected_default.buildspec new file mode 100644 index 000000000..a3a2cde87 --- /dev/null +++ b/tests/integration/cases/pypi_cachetools/expected_default.buildspec @@ -0,0 +1 @@ +{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "cachetools", "version": "6.2.1", "git_repo": "https://github.com/tkem/cachetools", "git_tag": "ca7508fd56103a1b6d6f17c8e93e36c60b44ca25", "newline": "lf", "language_version": "", "ecosystem": "pypi", "purl": "pkg:pypi/cachetools@6.2.1", "language": "python", "build_tool": "pip", "build_commands": [["pip", "install", "coverage", "tox"]], "build_backends": {"setuptools": "==(80.9.0)", "wheel": ""}} diff --git a/tests/integration/cases/pypi_cachetools/test.yaml b/tests/integration/cases/pypi_cachetools/test.yaml new file mode 100644 index 000000000..c2d0575b1 --- /dev/null +++ b/tests/integration/cases/pypi_cachetools/test.yaml @@ -0,0 +1,32 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Analyzing a PyPI PURL that has provenance available on the PyPI registry, and passes the SCM authenticity check. + It also tests buildspec generation. 
+ +tags: +- macaron-python-package +- tutorial + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/cachetools@6.2.1 +- name: Generate the buildspec + kind: gen-build-spec + options: + command_args: + - -purl + - pkg:pypi/cachetools@6.2.1 + - --output-format + - default-buildspec +- name: Compare Buildspec. + kind: compare + options: + kind: default_build_spec + result: output/buildspec/pypi/cachetools/macaron.buildspec + expected: expected_default.buildspec diff --git a/tests/integration/cases/pypi_toga/expected_default.buildspec b/tests/integration/cases/pypi_toga/expected_default.buildspec index ab168ac03..a91f1e137 100644 --- a/tests/integration/cases/pypi_toga/expected_default.buildspec +++ b/tests/integration/cases/pypi_toga/expected_default.buildspec @@ -1 +1 @@ -{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "toga", "version": "0.5.1", "git_repo": "https://github.com/beeware/toga", "git_tag": "ef1912b0a1b5c07793f9aa372409f5b9d36f2604", "newline": "lf", "language_version": "", "ecosystem": "pypi", "purl": "pkg:pypi/toga@0.5.1", "language": "python", "build_tool": "pip", "build_commands": [["pip", "install", "-U", "pip"]]} +{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "toga", "version": "0.5.1", "git_repo": "https://github.com/beeware/toga", "git_tag": "ef1912b0a1b5c07793f9aa372409f5b9d36f2604", "newline": "lf", "language_version": "", "ecosystem": "pypi", "purl": "pkg:pypi/toga@0.5.1", "language": "python", "build_tool": "pip", "build_commands": [["pip", "install", "-U", "pip"]], "build_backends": {"setuptools": "==(80.3.1)", "setuptools_scm": "==8.3.1", "setuptools_dynamic_dependencies": "==1.0.0"}} From 416edfe98823f08bcea13ea8f243a2e7a18531c0 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Mon, 3 Nov 2025 17:53:12 +1000 Subject: [PATCH 5/6] feat: infer python language version and modify language_version to be list[str] Signed-off-by: Abhinav Pradeep --- 
.../common_spec/base_spec.py | 2 +- .../build_spec_generator/common_spec/core.py | 2 +- .../common_spec/maven_spec.py | 6 ++++-- .../common_spec/pypi_spec.py | 19 ++++++++++++++++++- .../reproducible_central.py | 2 +- src/macaron/config/defaults.ini | 4 ++-- src/macaron/repo_finder/repo_finder_pypi.py | 2 +- .../package_registry/pypi_registry.py | 12 +++++++----- .../test_reproducible_central.py | 2 +- .../computer-k8s/expected_default.buildspec | 2 +- .../expected_default.buildspec | 2 +- .../pypi_toga/expected_default.buildspec | 2 +- 12 files changed, 39 insertions(+), 18 deletions(-) diff --git a/src/macaron/build_spec_generator/common_spec/base_spec.py b/src/macaron/build_spec_generator/common_spec/base_spec.py index 18c1b334a..92d4b9866 100644 --- a/src/macaron/build_spec_generator/common_spec/base_spec.py +++ b/src/macaron/build_spec_generator/common_spec/base_spec.py @@ -50,7 +50,7 @@ class BaseBuildSpecDict(TypedDict, total=False): newline: NotRequired[str] #: The version of the programming language or runtime, e.g., '11' for JDK, '3.11' for Python. - language_version: Required[str] + language_version: Required[list[str]] #: List of release dependencies. 
dependencies: NotRequired[list[str]] diff --git a/src/macaron/build_spec_generator/common_spec/core.py b/src/macaron/build_spec_generator/common_spec/core.py index e7f8610e3..d13ff3d2c 100644 --- a/src/macaron/build_spec_generator/common_spec/core.py +++ b/src/macaron/build_spec_generator/common_spec/core.py @@ -442,7 +442,7 @@ def gen_generic_build_spec( "git_repo": latest_component_repository.remote_path, "git_tag": latest_component_repository.commit_sha, "newline": "lf", - "language_version": lang_version or "", + "language_version": [lang_version] if lang_version else [], "ecosystem": purl.type, "purl": str(purl), "language": target_language, diff --git a/src/macaron/build_spec_generator/common_spec/maven_spec.py b/src/macaron/build_spec_generator/common_spec/maven_spec.py index f08602f28..ddfe96b71 100644 --- a/src/macaron/build_spec_generator/common_spec/maven_spec.py +++ b/src/macaron/build_spec_generator/common_spec/maven_spec.py @@ -58,12 +58,14 @@ def resolve_fields(self, purl: PackageURL) -> None: jdk_from_jar or "Cannot find any.", ) + existing = self.data["language_version"][0] if self.data["language_version"] else None + # Select JDK from jar or another source, with a default of version 8. 
- selected_jdk_version = jdk_from_jar or self.data["language_version"] if self.data["language_version"] else "8" + selected_jdk_version = jdk_from_jar or existing if existing else "8" major_jdk_version = normalize_jdk_version(selected_jdk_version) if not major_jdk_version: logger.error("Failed to obtain the major version of %s", selected_jdk_version) return - self.data["language_version"] = major_jdk_version + self.data["language_version"] = [major_jdk_version] diff --git a/src/macaron/build_spec_generator/common_spec/pypi_spec.py b/src/macaron/build_spec_generator/common_spec/pypi_spec.py index f341a1d9e..9c95e3dab 100644 --- a/src/macaron/build_spec_generator/common_spec/pypi_spec.py +++ b/src/macaron/build_spec_generator/common_spec/pypi_spec.py @@ -10,6 +10,7 @@ import tomli from packageurl import PackageURL from packaging.requirements import InvalidRequirement, Requirement +from packaging.utils import InvalidWheelFilename, parse_wheel_filename from macaron.build_spec_generator.common_spec.base_spec import BaseBuildSpec, BaseBuildSpecDict from macaron.config.defaults import defaults @@ -64,12 +65,12 @@ def resolve_fields(self, purl: PackageURL) -> None: if pypi_package_json.package_json or pypi_package_json.download(dest=""): requires_array: list[str] = [] build_backends: dict[str, str] = {} + python_version_list: list[str] = [] try: with pypi_package_json.wheel(): logger.debug("Wheel at %s", pypi_package_json.wheel_path) # Should only have .dist-info directory logger.debug("It has directories %s", ",".join(os.listdir(pypi_package_json.wheel_path))) - # Make build-req array wheel_contents, metadata_contents = self.read_directory(pypi_package_json.wheel_path, purl) generator, version = self.read_generator_line(wheel_contents) if generator != "": @@ -101,6 +102,9 @@ def resolve_fields(self, purl: PackageURL) -> None: content = tomli.loads(pyproject_content.decode("utf-8")) build_system: dict[str, list[str]] = content.get("build-system", {}) requires_array = 
build_system.get("requires", []) + python_version_constraint = content.get("project", {}).get("requires-python") + if python_version_constraint: + python_version_list.append(python_version_constraint) logger.debug("From pyproject.toml:") logger.debug(requires_array) except SourceCodeError: @@ -123,6 +127,19 @@ def resolve_fields(self, purl: PackageURL) -> None: logger.debug(build_backends) self.data["build_backends"] = build_backends + if not python_version_list: + try: + # Get python version specified in the wheel file name + logger.debug(pypi_package_json.wheel_filename) + _, _, _, tags = parse_wheel_filename(pypi_package_json.wheel_filename) + for tag in tags: + python_version_list.append(tag.interpreter) + logger.debug(python_version_list) + except InvalidWheelFilename: + logger.debug("Could not parse wheel file name to extract version") + + self.data["language_version"] = python_version_list + def read_directory(self, wheel_path: str, purl: PackageURL) -> tuple[str, str]: """ Read in the WHEEL and METADATA file from the .dist_info directory. 
diff --git a/src/macaron/build_spec_generator/reproducible_central/reproducible_central.py b/src/macaron/build_spec_generator/reproducible_central/reproducible_central.py index df9a7b099..ba0b61426 100644 --- a/src/macaron/build_spec_generator/reproducible_central/reproducible_central.py +++ b/src/macaron/build_spec_generator/reproducible_central/reproducible_central.py @@ -95,7 +95,7 @@ def gen_reproducible_central_build_spec(build_spec: BaseBuildSpecDict) -> str | "tool": ReproducibleCentralBuildTool[build_spec["build_tool"].upper()].value, "newline": build_spec["newline"], "buildinfo": f"target/{build_spec['artifact_id']}-{build_spec['version']}.buildinfo", - "jdk": build_spec["language_version"], + "jdk": build_spec["language_version"][0], "command": compose_shell_commands(build_spec["build_commands"]), } diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 8f4354e3c..c036598ca 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -646,7 +646,7 @@ custom_semgrep_rules_path = disabled_custom_rulesets = # As per https://peps.python.org/pep-0639/appendix-examples/, presumably most versions < 59.1.1 will work here setuptools_version_emitting_license = 56.2.0 - +# TODO: Investigate if other versions would be suitable setuptools_version_emitting_platform_unknown = 57.5.0 - +# TODO: Investigate if other versions would be suitable default_setuptools = 67.7.2 diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index c064a80c1..7adcede4a 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -58,7 +58,7 @@ def find_repo( pypi_registry = next((registry for registry in PACKAGE_REGISTRIES if isinstance(registry, PyPIRegistry)), None) if not pypi_registry: return "", RepoFinderInfo.PYPI_NO_REGISTRY - pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "", "") + pypi_asset = 
PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "", "", "") if not pypi_asset: # This should be unreachable, as the pypi_registry has already been confirmed to be of type PyPIRegistry. diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 33e41c7ec..0f61e4037 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -331,6 +331,7 @@ def download_package_wheel(self, url: str) -> str: members.append(member) if member.filename.endswith("METADATA"): members.append(member) + # Intended suppression. The tool is unable to see that .extractall is being called with a filter zip_file.extractall(temp_dir, members) # nosec B202:tarfile_unsafe_members except zipfile.BadZipFile as bad_zip: self.cleanup_sourcecode_directory(temp_dir, f"Error extracting wheel: {bad_zip}", bad_zip) @@ -560,6 +561,9 @@ class PyPIPackageJsonAsset: #: the wheel temporary location name wheel_path: str + #: name of the wheel file + wheel_filename: str + #: The size of the asset (in bytes). This attribute is added to match the AssetLocator #: protocol and is not used because pypi API registry does not provide it. @property @@ -704,12 +708,10 @@ def get_wheel_url(self, tag: str = "none-any") -> str | None: if distribution.get("packagetype") != "bdist_wheel": continue file_name: str = distribution.get("filename") or "" - # Ensure wheel matches tag - # IS CURRENTLY A BIT NAIVE, BUT ALSO INTENTIONAL: - # Do we want to search for "tag" in the wheel name, or force an exact match for the tag? 
if not file_name.endswith(f"{tag}.whl"): continue - # Continue getting url as get_sourcecode_url does + self.wheel_filename = file_name + # Continue on to get the wheel URL wheel_url: str = distribution.get("url") or "" if wheel_url: try: @@ -941,6 +943,6 @@ def find_or_create_pypi_asset( logger.debug("Failed to create PyPIPackageJson asset.") return None - asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {}, "", "") + asset = PyPIPackageJsonAsset(asset_name, asset_version, False, package_registry, {}, "", "", "") pypi_registry_info.metadata.append(asset) return asset diff --git a/tests/build_spec_generator/reproducible_central/test_reproducible_central.py b/tests/build_spec_generator/reproducible_central/test_reproducible_central.py index 5b7cbf664..39b8ef5ed 100644 --- a/tests/build_spec_generator/reproducible_central/test_reproducible_central.py +++ b/tests/build_spec_generator/reproducible_central/test_reproducible_central.py @@ -26,7 +26,7 @@ def fixture_base_build_spec() -> BaseBuildSpecDict: "git_tag": "sampletag", "build_tool": "maven", "newline": "lf", - "language_version": "17", + "language_version": ["17"], "build_commands": [["mvn", "package"]], "purl": "pkg:maven/com.oracle/example-artifact@1.2.3", } diff --git a/tests/integration/cases/org_apache_hugegraph/computer-k8s/expected_default.buildspec b/tests/integration/cases/org_apache_hugegraph/computer-k8s/expected_default.buildspec index 5a05b20d7..1bfeba572 100644 --- a/tests/integration/cases/org_apache_hugegraph/computer-k8s/expected_default.buildspec +++ b/tests/integration/cases/org_apache_hugegraph/computer-k8s/expected_default.buildspec @@ -1 +1 @@ -{"macaron_version": "0.18.0", "group_id": "org.apache.hugegraph", "artifact_id": "computer-k8s", "version": "1.0.0", "git_repo": "https://github.com/apache/hugegraph-computer", "git_tag": "d2b95262091d6572cc12dcda57d89f9cd44ac88b", "newline": "lf", "language_version": "11", "ecosystem": "maven", "purl":
"pkg:maven/org.apache.hugegraph/computer-k8s@1.0.0", "language": "java", "build_tool": "maven", "build_commands": [["mvn", "-DskipTests=true", "-Dmaven.test.skip=true", "-Dmaven.site.skip=true", "-Drat.skip=true", "-Dmaven.javadoc.skip=true", "clean", "package"]]} +{"macaron_version": "0.18.0", "group_id": "org.apache.hugegraph", "artifact_id": "computer-k8s", "version": "1.0.0", "git_repo": "https://github.com/apache/hugegraph-computer", "git_tag": "d2b95262091d6572cc12dcda57d89f9cd44ac88b", "newline": "lf", "language_version": ["11"], "ecosystem": "maven", "purl": "pkg:maven/org.apache.hugegraph/computer-k8s@1.0.0", "language": "java", "build_tool": "maven", "build_commands": [["mvn", "-DskipTests=true", "-Dmaven.test.skip=true", "-Dmaven.site.skip=true", "-Drat.skip=true", "-Dmaven.javadoc.skip=true", "clean", "package"]]} diff --git a/tests/integration/cases/pypi_cachetools/expected_default.buildspec b/tests/integration/cases/pypi_cachetools/expected_default.buildspec index a3a2cde87..469c99c9d 100644 --- a/tests/integration/cases/pypi_cachetools/expected_default.buildspec +++ b/tests/integration/cases/pypi_cachetools/expected_default.buildspec @@ -1 +1 @@ -{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "cachetools", "version": "6.2.1", "git_repo": "https://github.com/tkem/cachetools", "git_tag": "ca7508fd56103a1b6d6f17c8e93e36c60b44ca25", "newline": "lf", "language_version": "", "ecosystem": "pypi", "purl": "pkg:pypi/cachetools@6.2.1", "language": "python", "build_tool": "pip", "build_commands": [["pip", "install", "coverage", "tox"]], "build_backends": {"setuptools": "==(80.9.0)", "wheel": ""}} +{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "cachetools", "version": "6.2.1", "git_repo": "https://github.com/tkem/cachetools", "git_tag": "ca7508fd56103a1b6d6f17c8e93e36c60b44ca25", "newline": "lf", "language_version": ["py3"], "ecosystem": "pypi", "purl": "pkg:pypi/cachetools@6.2.1", "language": "python", "build_tool": "pip", 
"build_commands": [["pip", "install", "coverage", "tox"]], "build_backends": {"setuptools": "==(80.9.0)", "wheel": ""}} diff --git a/tests/integration/cases/pypi_toga/expected_default.buildspec b/tests/integration/cases/pypi_toga/expected_default.buildspec index a91f1e137..d5335b3e5 100644 --- a/tests/integration/cases/pypi_toga/expected_default.buildspec +++ b/tests/integration/cases/pypi_toga/expected_default.buildspec @@ -1 +1 @@ -{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "toga", "version": "0.5.1", "git_repo": "https://github.com/beeware/toga", "git_tag": "ef1912b0a1b5c07793f9aa372409f5b9d36f2604", "newline": "lf", "language_version": "", "ecosystem": "pypi", "purl": "pkg:pypi/toga@0.5.1", "language": "python", "build_tool": "pip", "build_commands": [["pip", "install", "-U", "pip"]], "build_backends": {"setuptools": "==(80.3.1)", "setuptools_scm": "==8.3.1", "setuptools_dynamic_dependencies": "==1.0.0"}} +{"macaron_version": "0.18.0", "group_id": null, "artifact_id": "toga", "version": "0.5.1", "git_repo": "https://github.com/beeware/toga", "git_tag": "ef1912b0a1b5c07793f9aa372409f5b9d36f2604", "newline": "lf", "language_version": [">= 3.9"], "ecosystem": "pypi", "purl": "pkg:pypi/toga@0.5.1", "language": "python", "build_tool": "pip", "build_commands": [["pip", "install", "-U", "pip"]], "build_backends": {"setuptools": "==(80.3.1)", "setuptools_scm": "==8.3.1", "setuptools_dynamic_dependencies": "==1.0.0"}} From c4f6b35449d14d5ec48243a72a11d078e9384b13 Mon Sep 17 00:00:00 2001 From: Abhinav Pradeep Date: Mon, 3 Nov 2025 17:57:31 +1000 Subject: [PATCH 6/6] fix: ensuring comment renders correctly Signed-off-by: Abhinav Pradeep --- src/macaron/build_spec_generator/common_spec/base_spec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/macaron/build_spec_generator/common_spec/base_spec.py b/src/macaron/build_spec_generator/common_spec/base_spec.py index 92d4b9866..11b5e8850 100644 --- 
a/src/macaron/build_spec_generator/common_spec/base_spec.py +++ b/src/macaron/build_spec_generator/common_spec/base_spec.py @@ -73,9 +73,9 @@ class BaseBuildSpecDict(TypedDict, total=False): #: Entry point script, class, or binary for running the project. entry_point: NotRequired[str | None] - # A "back end" is tool that a "front end" (such as pip/build) would call to - # package the source distribution into the wheel format. build_backends would - # be a list of these that were used in building the wheel alongside their version. + #: A "back end" is a tool that a "front end" (such as pip/build) would call to + #: package the source distribution into the wheel format. build_backends maps each + #: back end that was used in building the wheel to its version. build_backends: NotRequired[dict[str, str]]