From d1cec6606e444d0124cc21a5af67b2c2cfe75248 Mon Sep 17 00:00:00 2001 From: Mike Levin Date: Wed, 22 Apr 2026 05:28:22 -0400 Subject: [PATCH] Adding rel and target to the preserved whitelisted attributes during HTML and DOM simplification --- tools/scraper_tools.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/scraper_tools.py b/tools/scraper_tools.py index 4d410f0f..49fc1c9d 100644 --- a/tools/scraper_tools.py +++ b/tools/scraper_tools.py @@ -58,12 +58,13 @@ def _simplify_html_for_llm(html_content, default_title=""): # Remove all noise elements that confuse LLMs (Added 'svg' to the hit list!) for tag in soup(['script', 'style', 'noscript', 'meta', 'link', 'head', 'svg']): tag.decompose() - + # Clean up attributes - keep only automation-relevant ones for element in soup.find_all(): attrs_to_keep = {} for attr, value in element.attrs.items(): - if attr in ['id', 'role', 'data-testid', 'name', 'type', 'href', 'src', 'class', 'for', 'value', 'placeholder', 'title'] or attr.startswith('aria-'): + # Added 'rel' and 'target' to preserve SEO link data! + if attr in ['id', 'role', 'data-testid', 'name', 'type', 'href', 'src', 'class', 'for', 'value', 'placeholder', 'title', 'rel', 'target'] or attr.startswith('aria-'): attrs_to_keep[attr] = value element.attrs = attrs_to_keep