From 476dff3cce2d8605f470cd1fb45b337a527c14ab Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Tue, 19 May 2026 12:45:36 +0800
Subject: [PATCH] fix(fetch): fall back without readability js
---
src/fetch/README.md | 6 +-
src/fetch/src/mcp_server_fetch/__init__.py | 14 +++-
src/fetch/src/mcp_server_fetch/server.py | 34 ++++++++--
src/fetch/tests/test_server.py | 79 ++++++++++++++++++++++
4 files changed, 125 insertions(+), 8 deletions(-)
diff --git a/src/fetch/README.md b/src/fetch/README.md
index 2c3e048927..36bf214e26 100644
--- a/src/fetch/README.md
+++ b/src/fetch/README.md
@@ -26,7 +26,7 @@ The fetch tool will truncate the response, but by using the `start_index` argume
## Installation
-Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust.
+Optionally, install Node.js; the fetch server will then use a different, more robust HTML simplifier. If Node.js is not available, the server falls back to readabilipy's Python-only HTML simplifier.
### Using uv (recommended)
@@ -170,6 +170,10 @@ This can be customized by adding the argument `--user-agent=YourUserAgent` to th
The server can be configured to use a proxy by using the `--proxy-url` argument.
+### Customization - HTML simplification
+
+By default, the server uses readabilipy's optional Node.js-based simplifier when Node.js is available (that is, when a `node` executable is found on your `PATH`), and otherwise falls back to the Python-only simplifier. To force the Python-only simplifier even when Node.js is installed, add the `--no-readability-js` argument.
+
## Windows Configuration
If you're experiencing timeout issues on Windows, you may need to set the `PYTHONIOENCODING` environment variable to ensure proper character encoding:
diff --git a/src/fetch/src/mcp_server_fetch/__init__.py b/src/fetch/src/mcp_server_fetch/__init__.py
index 09744ce319..c3811fa1b7 100644
--- a/src/fetch/src/mcp_server_fetch/__init__.py
+++ b/src/fetch/src/mcp_server_fetch/__init__.py
@@ -16,9 +16,21 @@ def main():
help="Ignore robots.txt restrictions",
)
parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests")
+ parser.add_argument(
+ "--no-readability-js",
+ action="store_true",
+ help="Use readabilipy's Python-only HTML simplifier even when Node.js is installed",
+ )
args = parser.parse_args()
- asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url))
+ asyncio.run(
+ serve(
+ custom_user_agent=args.user_agent,
+ ignore_robots_txt=args.ignore_robots_txt,
+ proxy_url=args.proxy_url,
+ use_readability_js=not args.no_readability_js,
+ )
+ )
if __name__ == "__main__":
diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py
index b42c7b1f6b..8a4bd46a00 100644
--- a/src/fetch/src/mcp_server_fetch/server.py
+++ b/src/fetch/src/mcp_server_fetch/server.py
@@ -1,3 +1,4 @@
+import shutil
from typing import Annotated, Tuple
from urllib.parse import urlparse, urlunparse
@@ -24,7 +25,11 @@
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
-def extract_content_from_html(html: str) -> str:
+def _can_use_readability_js(use_readability_js: bool) -> bool:
+ return use_readability_js and shutil.which("node") is not None
+
+
+def extract_content_from_html(html: str, use_readability_js: bool = True) -> str:
"""Extract and convert HTML content to Markdown format.
Args:
@@ -34,7 +39,7 @@ def extract_content_from_html(html: str) -> str:
Simplified markdown version of the content
"""
ret = readabilipy.simple_json.simple_json_from_html_string(
- html, use_readability=True
+ html, use_readability=_can_use_readability_js(use_readability_js)
)
if not ret["content"]:
return "Page failed to be simplified from HTML"
@@ -109,7 +114,11 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:
async def fetch_url(
- url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
+ url: str,
+ user_agent: str,
+ force_raw: bool = False,
+ proxy_url: str | None = None,
+ use_readability_js: bool = True,
) -> Tuple[str, str]:
"""
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
@@ -140,7 +149,9 @@ async def fetch_url(
)
if is_page_html and not force_raw:
- return extract_content_from_html(page_raw), ""
+ return extract_content_from_html(
+ page_raw, use_readability_js=use_readability_js
+ ), ""
return (
page_raw,
@@ -182,6 +193,7 @@ async def serve(
custom_user_agent: str | None = None,
ignore_robots_txt: bool = False,
proxy_url: str | None = None,
+ use_readability_js: bool = True,
) -> None:
"""Run the fetch MCP server.
@@ -189,6 +201,7 @@ async def serve(
custom_user_agent: Optional custom User-Agent string to use for requests
ignore_robots_txt: Whether to ignore robots.txt restrictions
proxy_url: Optional proxy URL to use for requests
+ use_readability_js: Whether to use readabilipy's optional Node.js simplifier
"""
server = Server("mcp-fetch")
user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
@@ -235,7 +248,11 @@ async def call_tool(name, arguments: dict) -> list[TextContent]:
await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)
content, prefix = await fetch_url(
- url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
+ url,
+ user_agent_autonomous,
+ force_raw=args.raw,
+ proxy_url=proxy_url,
+ use_readability_js=use_readability_js,
)
original_length = len(content)
if args.start_index >= original_length:
@@ -262,7 +279,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
url = arguments["url"]
try:
- content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url)
+ content, prefix = await fetch_url(
+ url,
+ user_agent_manual,
+ proxy_url=proxy_url,
+ use_readability_js=use_readability_js,
+ )
# TODO: after SDK bug is addressed, don't catch the exception
except McpError as e:
return GetPromptResult(
diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py
index 96c1cb38c7..9825ab5e7e 100644
--- a/src/fetch/tests/test_server.py
+++ b/src/fetch/tests/test_server.py
@@ -87,6 +87,54 @@ def test_empty_content_returns_error(self):
result = extract_content_from_html(html)
assert "" in result
+ def test_uses_readability_js_when_node_is_available(self):
+ html = "Hello
"
+
+ with (
+ patch("mcp_server_fetch.server.shutil.which", return_value="node"),
+ patch(
+ "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
+ ) as mock_simple_json,
+ ):
+ mock_simple_json.return_value = {"content": "Hello
"}
+
+ result = extract_content_from_html(html)
+
+ mock_simple_json.assert_called_once_with(html, use_readability=True)
+ assert "Hello" in result
+
+ def test_falls_back_without_node(self):
+ html = "Hello
"
+
+ with (
+ patch("mcp_server_fetch.server.shutil.which", return_value=None),
+ patch(
+ "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
+ ) as mock_simple_json,
+ ):
+ mock_simple_json.return_value = {"content": "Hello
"}
+
+ result = extract_content_from_html(html)
+
+ mock_simple_json.assert_called_once_with(html, use_readability=False)
+ assert "Hello" in result
+
+ def test_can_disable_readability_js(self):
+ html = "Hello
"
+
+ with (
+ patch("mcp_server_fetch.server.shutil.which", return_value="node"),
+ patch(
+ "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
+ ) as mock_simple_json,
+ ):
+ mock_simple_json.return_value = {"content": "Hello
"}
+
+ result = extract_content_from_html(html, use_readability_js=False)
+
+ mock_simple_json.assert_called_once_with(html, use_readability=False)
+ assert "Hello" in result
+
class TestCheckMayAutonomouslyFetchUrl:
"""Tests for check_may_autonomously_fetch_url function."""
@@ -219,6 +267,37 @@ async def test_fetch_html_page(self):
assert isinstance(content, str)
assert prefix == ""
+ @pytest.mark.asyncio
+ async def test_fetch_html_forwards_readability_js_option(self):
+ """Test that fetch_url forwards the readability JS option."""
+ html_content = "Test
"
+ mock_response = MagicMock()
+ mock_response.status_code = 200
+ mock_response.text = html_content
+ mock_response.headers = {"content-type": "text/html"}
+
+ with (
+ patch("httpx.AsyncClient") as mock_client_class,
+ patch(
+ "mcp_server_fetch.server.extract_content_from_html",
+ return_value="Test",
+ ) as mock_extract,
+ ):
+ mock_client = AsyncMock()
+ mock_client.get = AsyncMock(return_value=mock_response)
+ mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client)
+ mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None)
+
+ content, prefix = await fetch_url(
+ "https://example.com/page",
+ DEFAULT_USER_AGENT_AUTONOMOUS,
+ use_readability_js=False,
+ )
+
+ mock_extract.assert_called_once_with(html_content, use_readability_js=False)
+ assert content == "Test"
+ assert prefix == ""
+
@pytest.mark.asyncio
async def test_fetch_html_page_raw(self):
"""Test fetching an HTML page with raw=True returns original HTML."""