Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/macaron/slsa_analyzer/package_registry/pypi_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from macaron.util import (
can_download_file,
download_file_with_size_limit,
html_is_js_challenge,
send_get_http_raw,
stream_file_with_size_limit,
)
Expand Down Expand Up @@ -321,6 +322,9 @@ def get_package_page(self, package_name: str) -> str | None:
response = send_get_http_raw(url)
if response:
html_snippets = response.content.decode("utf-8")
if html_is_js_challenge(html_snippets):
logger.debug("URL returned a JavaScript Challenge: %s", url)
return None
return html_snippets
return None

Expand Down Expand Up @@ -362,6 +366,9 @@ def get_maintainer_profile_page(self, username: str) -> str | None:
response = send_get_http_raw(url, headers=None)
if response:
html_snippets = response.content.decode("utf-8")
if html_is_js_challenge(html_snippets):
logger.debug("URL returned a JavaScript Challenge: %s", url)
return None
return html_snippets
return None

Expand Down
35 changes: 35 additions & 0 deletions src/macaron/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from typing import BinaryIO

import requests
from bs4 import BeautifulSoup
from requests.models import Response

from macaron.config.defaults import defaults
Expand Down Expand Up @@ -595,3 +596,37 @@ def decode(data: bytes) -> str | None:

logger.debug("Failed to decode bytes using most common character encodings.")
return None


def html_is_js_challenge(html: str) -> bool:
    """Check if this HTML is the JavaScript Challenge response.

    The JavaScript Challenge is generally returned to a GET request when a CDN serves some
    JavaScript code to be rendered for the page. This usually means the HTML page isn't obtained
    when using request libraries that cannot render JavaScript, and this page is returned instead.

    The page is recognised by three markers, all of which must be present:

    * a ``<title>`` containing ``Client Challenge``;
    * a ``<span>`` containing ``JavaScript is disabled in your browser``;
    * a ``<p>`` containing ``Please enable JavaScript to proceed``.

    Parameters
    ----------
    html: str
        The string HTML of the page returned by a request.

    Returns
    -------
    bool
        True if the page is a JavaScript Challenge html response. False otherwise.
    """
    # Expected layout of the three markers in the challenge page:
    # <html><head><title>Client Challenge
    # <html><body><noscript><div><div><span>JavaScript is disabled in your browser
    # <html><body><noscript><div><div><p>Please enable JavaScript to proceed

    soup = BeautifulSoup(html, "html.parser")

    # Guard clause: without the challenge title there is no need to look at the body.
    title = soup.find("title")
    if title is None or "Client Challenge" not in title.get_text():
        return False

    # Scan every <span> and <p>, not just the first one in the document, so that an
    # unrelated element appearing earlier in the page does not hide the challenge text.
    has_span = any(
        "JavaScript is disabled in your browser" in span.get_text() for span in soup.find_all("span")
    )
    has_msg = any(
        "Please enable JavaScript to proceed" in paragraph.get_text() for paragraph in soup.find_all("p")
    )

    return has_span and has_msg
Loading