-
Notifications
You must be signed in to change notification settings - Fork 17
Enhancing URL detection #57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0dbaa66
3f2711b
30893a6
7ce778f
78ba233
bb55585
9ad835d
0dd7311
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -27,14 +27,21 @@ | |||||
| from typing import Any | ||||||
| from urllib.parse import ParseResult, urlparse | ||||||
|
|
||||||
| from pydantic import BaseModel, Field | ||||||
| from pydantic import BaseModel, Field, field_validator | ||||||
|
|
||||||
| from guardrails.registry import default_spec_registry | ||||||
| from guardrails.spec import GuardrailSpecMetadata | ||||||
| from guardrails.types import GuardrailResult | ||||||
|
|
||||||
| __all__ = ["urls"] | ||||||
|
|
||||||
| DEFAULT_PORTS = { | ||||||
| "http": 80, | ||||||
| "https": 443, | ||||||
| } | ||||||
|
|
||||||
| SCHEME_PREFIX_RE = re.compile(r"^[a-z][a-z0-9+.-]*://") | ||||||
|
|
||||||
|
|
||||||
| @dataclass(frozen=True, slots=True) | ||||||
| class UrlDetectionResult: | ||||||
|
|
@@ -66,9 +73,53 @@ class URLConfig(BaseModel): | |||||
| description="Allow subdomains of allowed domains (e.g. api.example.com if example.com is allowed)", | ||||||
| ) | ||||||
|
|
||||||
| @field_validator("allowed_schemes", mode="before") | ||||||
| @classmethod | ||||||
| def normalize_allowed_schemes(cls, value: Any) -> set[str]: | ||||||
| """Normalize allowed schemes to bare identifiers without delimiters.""" | ||||||
| if value is None: | ||||||
| return {"https"} | ||||||
|
|
||||||
| if isinstance(value, str): | ||||||
| raw_values = [value] | ||||||
| else: | ||||||
| raw_values = list(value) | ||||||
|
|
||||||
| normalized: set[str] = set() | ||||||
| for entry in raw_values: | ||||||
| if not isinstance(entry, str): | ||||||
| raise TypeError("allowed_schemes entries must be strings") | ||||||
| cleaned = entry.strip().lower() | ||||||
| if not cleaned: | ||||||
| continue | ||||||
| # Support inputs like "https://", "HTTPS:", or " https " | ||||||
| if cleaned.endswith("://"): | ||||||
| cleaned = cleaned[:-3] | ||||||
| cleaned = cleaned.removesuffix(":") | ||||||
| if cleaned: | ||||||
| normalized.add(cleaned) | ||||||
|
|
||||||
| if not normalized: | ||||||
| raise ValueError("allowed_schemes must include at least one scheme") | ||||||
|
|
||||||
| return normalized | ||||||
|
|
||||||
|
|
||||||
| def _detect_urls(text: str) -> list[str]: | ||||||
| """Detect URLs using regex.""" | ||||||
| """Detect URLs using regex patterns with deduplication. | ||||||
|
|
||||||
| Detects URLs with explicit schemes (http, https, ftp, data, javascript, | ||||||
| vbscript), domain-like patterns without schemes, and IP addresses. | ||||||
| Deduplicates to avoid returning both scheme-ful and scheme-less versions | ||||||
| of the same URL. | ||||||
|
|
||||||
| Args: | ||||||
| text: The text to scan for URLs. | ||||||
|
|
||||||
| Returns: | ||||||
| List of unique URL strings found in the text, with trailing | ||||||
| punctuation removed. | ||||||
| """ | ||||||
| # Pattern for cleaning trailing punctuation (] must be escaped) | ||||||
| PUNCTUATION_CLEANUP = r"[.,;:!?)\]]+$" | ||||||
|
|
||||||
|
|
@@ -155,55 +206,110 @@ def _detect_urls(text: str) -> list[str]: | |||||
| return list(dict.fromkeys([url for url in final_urls if url])) | ||||||
|
|
||||||
|
|
||||||
| def _validate_url_security(url_string: str, config: URLConfig) -> tuple[ParseResult | None, str]: | ||||||
| """Validate URL using stdlib urllib.parse.""" | ||||||
| def _validate_url_security(url_string: str, config: URLConfig) -> tuple[ParseResult | None, str, bool]: | ||||||
| """Validate URL security properties using urllib.parse. | ||||||
|
|
||||||
| Checks URL structure, validates the scheme is allowed, and ensures no | ||||||
| credentials are embedded in userinfo if block_userinfo is enabled. | ||||||
|
|
||||||
| Args: | ||||||
| url_string: The URL string to validate. | ||||||
| config: Configuration specifying allowed schemes and userinfo policy. | ||||||
|
|
||||||
| Returns: | ||||||
| A tuple of (parsed_url, error_reason, had_explicit_scheme). If validation | ||||||
| succeeds, parsed_url is a ParseResult, error_reason is empty, and | ||||||
| had_explicit_scheme indicates if the original URL included a scheme. | ||||||
| If validation fails, parsed_url is None and error_reason describes the failure. | ||||||
| """ | ||||||
| try: | ||||||
| # Parse URL - preserve original scheme for validation | ||||||
| # Parse URL - track whether scheme was explicit | ||||||
| has_explicit_scheme = False | ||||||
| if "://" in url_string: | ||||||
| # Standard URL with double-slash scheme (http://, https://, ftp://, etc.) | ||||||
| parsed_url = urlparse(url_string) | ||||||
| original_scheme = parsed_url.scheme | ||||||
| has_explicit_scheme = True | ||||||
| elif ":" in url_string and url_string.split(":", 1)[0] in {"data", "javascript", "vbscript", "mailto"}: | ||||||
| # Special single-colon schemes | ||||||
| parsed_url = urlparse(url_string) | ||||||
| original_scheme = parsed_url.scheme | ||||||
| has_explicit_scheme = True | ||||||
| else: | ||||||
| # Add http scheme for parsing, but remember this is a default | ||||||
| # Add http scheme for parsing only (user didn't specify a scheme) | ||||||
| parsed_url = urlparse(f"http://{url_string}") | ||||||
| original_scheme = "http" # Default scheme for scheme-less URLs | ||||||
| original_scheme = None # No explicit scheme | ||||||
| has_explicit_scheme = False | ||||||
|
|
||||||
| # Basic validation: must have scheme and netloc (except for special schemes) | ||||||
| if not parsed_url.scheme: | ||||||
| return None, "Invalid URL format" | ||||||
| return None, "Invalid URL format", False | ||||||
|
|
||||||
| # Special schemes like data: and javascript: don't need netloc | ||||||
| special_schemes = {"data", "javascript", "vbscript", "mailto"} | ||||||
| if original_scheme not in special_schemes and not parsed_url.netloc: | ||||||
| return None, "Invalid URL format" | ||||||
| if parsed_url.scheme not in special_schemes and not parsed_url.netloc: | ||||||
| return None, "Invalid URL format", False | ||||||
|
|
||||||
| # Security validations - use original scheme | ||||||
| if original_scheme not in config.allowed_schemes: | ||||||
| return None, f"Blocked scheme: {original_scheme}" | ||||||
| # Security validations - only validate scheme if it was explicitly provided | ||||||
| if has_explicit_scheme and original_scheme not in config.allowed_schemes: | ||||||
| return None, f"Blocked scheme: {original_scheme}", has_explicit_scheme | ||||||
steven10a marked this conversation as resolved.
Show resolved
Hide resolved
Comment on lines
+253
to
+255
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The new scheme check only runs when Useful? React with 👍 / 👎.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. WAI. Same comment as before. |
||||||
|
|
||||||
| if config.block_userinfo and parsed_url.username: | ||||||
| return None, "Contains userinfo (potential credential injection)" | ||||||
| if config.block_userinfo and (parsed_url.username or parsed_url.password): | ||||||
| return None, "Contains userinfo (potential credential injection)", has_explicit_scheme | ||||||
|
|
||||||
| # Everything else (IPs, localhost, private IPs) goes through allow list logic | ||||||
| return parsed_url, "" | ||||||
| return parsed_url, "", has_explicit_scheme | ||||||
|
|
||||||
| except (ValueError, UnicodeError, AttributeError) as e: | ||||||
| # Common URL parsing errors: | ||||||
| # - ValueError: Invalid URL structure, invalid port, etc. | ||||||
| # - UnicodeError: Invalid encoding in URL | ||||||
| # - AttributeError: Unexpected URL structure | ||||||
| return None, f"Invalid URL format: {str(e)}" | ||||||
| return None, f"Invalid URL format: {str(e)}", False | ||||||
| except Exception as e: | ||||||
| # Catch any unexpected errors but provide debugging info | ||||||
| return None, f"URL parsing error: {type(e).__name__}: {str(e)}" | ||||||
| return None, f"URL parsing error: {type(e).__name__}: {str(e)}", False | ||||||
|
|
||||||
|
|
||||||
| def _safe_get_port(parsed: ParseResult, scheme: str) -> int | None: | ||||||
| """Safely extract port from ParseResult, handling malformed ports. | ||||||
|
|
||||||
| Args: | ||||||
| parsed: The parsed URL. | ||||||
| scheme: The URL scheme (for default port lookup). | ||||||
|
|
||||||
| Returns: | ||||||
| The port number, the default port for the scheme, or None if invalid. | ||||||
| """ | ||||||
| try: | ||||||
| return parsed.port or DEFAULT_PORTS.get(scheme.lower()) | ||||||
| except ValueError: | ||||||
| # Port is out of range (0-65535) or malformed | ||||||
| return None | ||||||
|
|
||||||
|
|
||||||
| def _is_url_allowed( | ||||||
| parsed_url: ParseResult, | ||||||
| allow_list: list[str], | ||||||
| allow_subdomains: bool, | ||||||
| url_had_explicit_scheme: bool, | ||||||
| ) -> bool: | ||||||
| """Check if parsed URL matches any entry in the allow list. | ||||||
|
|
||||||
| Supports domain names, IP addresses, CIDR blocks, and full URLs with | ||||||
| paths/ports/query strings. Allow list entries without explicit schemes | ||||||
| match any scheme. Entries with schemes must match exactly against URLs | ||||||
| with explicit schemes, but match any scheme-less URL. | ||||||
steven10a marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
|
||||||
| def _is_url_allowed(parsed_url: ParseResult, allow_list: list[str], allow_subdomains: bool) -> bool: | ||||||
| """Check if URL is allowed.""" | ||||||
| Args: | ||||||
| parsed_url: The parsed URL to check. | ||||||
| allow_list: List of allowed URL patterns (domains, IPs, CIDR, full URLs). | ||||||
| allow_subdomains: If True, subdomains of allowed domains are permitted. | ||||||
| url_had_explicit_scheme: Whether the original URL included an explicit scheme. | ||||||
|
|
||||||
| Returns: | ||||||
| True if the URL matches any allow list entry, False otherwise. | ||||||
| """ | ||||||
| if not allow_list: | ||||||
| return False | ||||||
|
|
||||||
|
|
@@ -212,30 +318,109 @@ def _is_url_allowed(parsed_url: ParseResult, allow_list: list[str], allow_subdom | |||||
| return False | ||||||
|
|
||||||
| url_host = url_host.lower() | ||||||
| url_domain = url_host.replace("www.", "") | ||||||
|
||||||
| url_domain = url_host.replace("www.", "") | |
| url_domain = url_host.removeprefix("www.") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
WAI. Same comment as above
steven10a marked this conversation as resolved.
Show resolved
Hide resolved
Copilot
AI
Nov 20, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using replace("www.", "") removes all occurrences of "www." from the hostname, not just the prefix. This could lead to unexpected behavior. For example, "www.www.example.com" would be treated as equivalent to "example.com" when matching allow list entries. Consider using removeprefix("www.") instead to only remove the "www." prefix, or document this behavior explicitly if it's intentional.
| allowed_domain = allowed_host.replace("www.", "") | |
| allowed_domain = allowed_host.removeprefix("www.") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is [nit] or actually beneficial. Extra instances of www. would clearly be a typo and does not add anything malicious. Without removing the extra www. we would have higher unnecessary mismatches
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] Limited default port coverage: The
DEFAULT_PORTSdictionary only includes mappings for HTTP (80) and HTTPS (443), but the code supports additional schemes like FTP, data, javascript, vbscript, and mailto (as seen in the detection patterns and special scheme handling). FTP typically uses port 21 by default. Consider adding FTP's default port or documenting that only HTTP/HTTPS have default port handling.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nit]