diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 3df2c8460..4f91baa59 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -29,6 +29,7 @@ from macaron.util import ( can_download_file, download_file_with_size_limit, + html_is_js_challenge, send_get_http_raw, stream_file_with_size_limit, ) @@ -321,6 +322,9 @@ def get_package_page(self, package_name: str) -> str | None: response = send_get_http_raw(url) if response: html_snippets = response.content.decode("utf-8") + if html_is_js_challenge(html_snippets): + logger.debug("URL returned a JavaScript Challenge: %s", url) + return None return html_snippets return None @@ -362,6 +366,9 @@ def get_maintainer_profile_page(self, username: str) -> str | None: response = send_get_http_raw(url, headers=None) if response: html_snippets = response.content.decode("utf-8") + if html_is_js_challenge(html_snippets): + logger.debug("URL returned a JavaScript Challenge: %s", url) + return None return html_snippets return None diff --git a/src/macaron/util.py b/src/macaron/util.py index 5475fb5f0..845117651 100644 --- a/src/macaron/util.py +++ b/src/macaron/util.py @@ -13,6 +13,7 @@ from typing import BinaryIO import requests +from bs4 import BeautifulSoup from requests.models import Response from macaron.config.defaults import defaults @@ -595,3 +596,37 @@ def decode(data: bytes) -> str | None: logger.debug("Failed to decode bytes using most common character encodings.") return None + + +def html_is_js_challenge(html: str) -> bool: + """Check if this HTML is the JavaScript Challenge response. + + The JavaScript Challenge is generally returned to a GET request when a CDN serves some + JavaScript code to be rendered for the page. This usually means the HTML page isn't obtained + when using request libraries that cannot render JavaScript, and this page is returned instead. + + Parameters + ---------- + html: str + The string HTML of the page returned by a request. + + Returns + ------- + bool + True if the page is a JavaScript Challenge html response. False otherwise. + """ + # Main three components: + #