diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py index c5ffb840e..0296cfc38 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py @@ -24,7 +24,18 @@ def __init__(self) -> None: super().__init__( name="similar_project_analyzer", heuristic=Heuristics.SIMILAR_PROJECTS, - depends_on=None, + depends_on=[ + (Heuristics.EMPTY_PROJECT_LINK, HeuristicResult.FAIL), + (Heuristics.ONE_RELEASE, HeuristicResult.FAIL), + (Heuristics.HIGH_RELEASE_FREQUENCY, HeuristicResult.FAIL), + (Heuristics.UNCHANGED_RELEASE, HeuristicResult.FAIL), + (Heuristics.CLOSER_RELEASE_JOIN_DATE, HeuristicResult.FAIL), + (Heuristics.SUSPICIOUS_SETUP, HeuristicResult.FAIL), + (Heuristics.WHEEL_ABSENCE, HeuristicResult.FAIL), + (Heuristics.ANOMALOUS_VERSION, HeuristicResult.FAIL), + (Heuristics.TYPOSQUATTING_PRESENCE, HeuristicResult.FAIL), + (Heuristics.FAKE_EMAIL, HeuristicResult.FAIL), + ], ) def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: @@ -106,6 +117,9 @@ def get_structure(self, package_name: str) -> list[str]: list[str]: The list of files in the package's sdist. """ + # TODO: We should not download the source distributions for every package. + # This is very inefficient. We should find a different way to extract the package + # structure, e.g., the inspector service? sdist_url = self.get_url(package_name) if not sdist_url: logger.debug("Package %s does not have a sdist.", package_name) @@ -117,10 +131,16 @@ def get_structure(self, package_name: str) -> list[str]: return [] buffer = io.BytesIO(response.content) - with tarfile.open(fileobj=buffer, mode="r:gz") as tf: - members = [ - member.name for member in tf.getmembers() if member.name and not member.name.startswith("PAXHeaders/") - ] + try: + with tarfile.open(fileobj=buffer, mode="r:gz") as tf: + members = [ + member.name + for member in tf.getmembers() + if member.name and not member.name.startswith("PAXHeaders/") + ] + except (tarfile.TarError, OSError) as error: + logger.debug("Error reading source code tar file: %s", error) + return [] return members