diff --git a/scripts/find_packages.sh b/scripts/find_packages.sh
new file mode 100755
index 000000000..0762565e7
--- /dev/null
+++ b/scripts/find_packages.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+#
+# This script fetches the list of top PyPI packages and saves them to a file.
+# It downloads the data from https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json,
+# extracts the top 5000 package names using jq, and saves them to the specified location.
+#
+# If the destination file already exists, the script will do nothing;
+# delete the file first to force a refresh.
+#
+# Usage: ./find_packages.sh [FOLDER] [FILE]
+#   - FOLDER: The destination folder (default: src/macaron/resources, resolved relative to this script)
+#   - FILE: The destination filename (default: popular_packages.txt)
+#
+# Dependencies: curl, jq.
+
+# Make a failed download fail the whole `curl | jq` pipeline.
+set -o pipefail
+
+# Set default values. The default folder is resolved relative to this script,
+# so the script behaves the same from any working directory.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DEFAULT_FOLDER="$SCRIPT_DIR/../src/macaron/resources"
+DEFAULT_FILE="popular_packages.txt"
+
+# Override with provided arguments if they exist
+FOLDER=${1:-$DEFAULT_FOLDER}
+FILE=${2:-$DEFAULT_FILE}
+
+FULL_PATH="$FOLDER/$FILE"
+URL="https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json"
+
+# Check if file exists
+if [ -f "$FULL_PATH" ]; then
+    echo "$FULL_PATH already exists. Nothing to do."
+    exit 0
+fi
+
+echo "$FULL_PATH not found. Fetching top PyPI packages..."
+
+# Ensure the directory exists
+mkdir -p "$FOLDER" || exit 1
+
+# Write to a temporary file first so that a failed run never leaves an empty or
+# partial destination file behind (which would make every later run a no-op).
+TMP_FILE="$FULL_PATH.tmp.$$"
+trap 'rm -f "$TMP_FILE"' EXIT
+
+# Fetch and process JSON using curl and jq; -f turns HTTP errors into a non-zero exit.
+if curl -fsSL "$URL" | jq -r '.rows[:5000] | sort_by(-.download_count) | .[].project' > "$TMP_FILE" \
+    && [ -s "$TMP_FILE" ] && mv "$TMP_FILE" "$FULL_PATH"; then
+    echo "Successfully saved top 5000 packages to $FULL_PATH"
+else
+    echo "Failed to fetch or process package data." >&2
+    exit 1
+fi
diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md
index 7aeda9417..ba3cfb8eb 100644
--- a/src/macaron/malware_analyzer/README.md
+++ b/src/macaron/malware_analyzer/README.md
@@ -56,6 +56,14 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b
 - **Description**: Checks if the package name is suspiciously similar to any package name in a predefined list of popular packages. The similarity check incorporates the Jaro-Winkler distance and considers keyboard layout proximity to identify potential typosquatting.
 - **Rule**: Return `HeuristicResult.FAIL` if the similarity ratio between the package name and any popular package name meets or exceeds a defined threshold; otherwise, return `HeuristicResult.PASS`.
 - **Dependency**: None.
+
+  > **Note**: This heuristic relies on a list of popular packages stored in [`src/macaron/resources/popular_packages.txt`](../resources/popular_packages.txt). Maintainers should periodically update this list by running the [`find_packages.sh`](../../../scripts/find_packages.sh) script from the project root directory. This ensures the typosquatting detection remains effective against the latest popular packages.
+  >
+  > Example:
+  > ```bash
+  > ./scripts/find_packages.sh
+  > ```
+  > The script downloads the top 5000 PyPI packages and writes the resource file. It does nothing if the file already exists, so delete the existing file first to refresh the list.
 ### Source Code Analysis with Semgrep
 **PyPI Source Code Analyzer**
 - **Description**: Uses Semgrep, with default rules written in `src/macaron/resources/pypi_malware_rules` and custom rules available by supplying a path to `custom_semgrep_rules` in `defaults.ini`, to scan the package `.tar` source code.
diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/typosquatting_presence.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/typosquatting_presence.py index dbeb466b6..87658f714 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/typosquatting_presence.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/typosquatting_presence.py @@ -281,7 +281,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes distance_ratio = self.ratio(package_name, popular_package) if distance_ratio >= self.distance_ratio_threshold: - logger.info( + logger.debug( "Potential typosquatting detected: '%s' is similar to popular package '%s' (ratio: %.3f)", package_name, popular_package,