From 476dff3cce2d8605f470cd1fb45b337a527c14ab Mon Sep 17 00:00:00 2001 From: Yufeng He <40085740+he-yufeng@users.noreply.github.com> Date: Tue, 19 May 2026 12:45:36 +0800 Subject: [PATCH] fix(fetch): fall back without readability js --- src/fetch/README.md | 6 +- src/fetch/src/mcp_server_fetch/__init__.py | 14 +++- src/fetch/src/mcp_server_fetch/server.py | 34 ++++++++-- src/fetch/tests/test_server.py | 79 ++++++++++++++++++++++ 4 files changed, 125 insertions(+), 8 deletions(-) diff --git a/src/fetch/README.md b/src/fetch/README.md index 2c3e048927..36bf214e26 100644 --- a/src/fetch/README.md +++ b/src/fetch/README.md @@ -26,7 +26,7 @@ The fetch tool will truncate the response, but by using the `start_index` argume ## Installation -Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust. +Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust. If node.js is not available, the server falls back to readabilipy's Python-only HTML simplifier. ### Using uv (recommended) @@ -170,6 +170,10 @@ This can be customized by adding the argument `--user-agent=YourUserAgent` to th The server can be configured to use a proxy by using the `--proxy-url` argument. +### Customization - HTML simplification + +By default, the server uses readabilipy's optional node.js simplifier when node.js is available, and otherwise falls back to the Python-only simplifier. To force the Python-only path even when node.js is installed, add the `--no-readability-js` argument. + ## Windows Configuration If you're experiencing timeout issues on Windows, you may need to set the `PYTHONIOENCODING` environment variable to ensure proper character encoding: diff --git a/src/fetch/src/mcp_server_fetch/__init__.py b/src/fetch/src/mcp_server_fetch/__init__.py index 09744ce319..c3811fa1b7 100644 --- a/src/fetch/src/mcp_server_fetch/__init__.py +++ b/src/fetch/src/mcp_server_fetch/__init__.py @@ -16,9 +16,21 @@ def main(): help="Ignore robots.txt restrictions", ) parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests") + parser.add_argument( + "--no-readability-js", + action="store_true", + help="Use readabilipy's Python-only HTML simplifier even when Node.js is installed", + ) args = parser.parse_args() - asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url)) + asyncio.run( + serve( + custom_user_agent=args.user_agent, + ignore_robots_txt=args.ignore_robots_txt, + proxy_url=args.proxy_url, + use_readability_js=not args.no_readability_js, + ) + ) if __name__ == "__main__": diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index b42c7b1f6b..8a4bd46a00 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -1,3 +1,4 @@ +import shutil from typing import Annotated, Tuple from urllib.parse import urlparse, urlunparse @@ -24,7 +25,11 @@ DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)" -def extract_content_from_html(html: str) -> str: +def _can_use_readability_js(use_readability_js: bool) -> bool: + return use_readability_js and shutil.which("node") is not None + + +def extract_content_from_html(html: str, use_readability_js: bool = True) -> str: """Extract and convert HTML content to Markdown format. Args: @@ -34,7 +39,7 @@ def extract_content_from_html(html: str) -> str: Simplified markdown version of the content """ ret = readabilipy.simple_json.simple_json_from_html_string( - html, use_readability=True + html, use_readability=_can_use_readability_js(use_readability_js) ) if not ret["content"]: return "Page failed to be simplified from HTML" @@ -109,7 +114,11 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url: async def fetch_url( - url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None + url: str, + user_agent: str, + force_raw: bool = False, + proxy_url: str | None = None, + use_readability_js: bool = True, ) -> Tuple[str, str]: """ Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information. @@ -140,7 +149,9 @@ async def fetch_url( ) if is_page_html and not force_raw: - return extract_content_from_html(page_raw), "" + return extract_content_from_html( + page_raw, use_readability_js=use_readability_js + ), "" return ( page_raw, @@ -182,6 +193,7 @@ async def serve( custom_user_agent: str | None = None, ignore_robots_txt: bool = False, proxy_url: str | None = None, + use_readability_js: bool = True, ) -> None: """Run the fetch MCP server. @@ -189,6 +201,7 @@ async def serve( custom_user_agent: Optional custom User-Agent string to use for requests ignore_robots_txt: Whether to ignore robots.txt restrictions proxy_url: Optional proxy URL to use for requests + use_readability_js: Whether to use readabilipy's optional Node.js simplifier """ server = Server("mcp-fetch") user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS @@ -235,7 +248,11 @@ async def call_tool(name, arguments: dict) -> list[TextContent]: await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url) content, prefix = await fetch_url( - url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url + url, + user_agent_autonomous, + force_raw=args.raw, + proxy_url=proxy_url, + use_readability_js=use_readability_js, ) original_length = len(content) if args.start_index >= original_length: @@ -262,7 +279,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult: url = arguments["url"] try: - content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url) + content, prefix = await fetch_url( + url, + user_agent_manual, + proxy_url=proxy_url, + use_readability_js=use_readability_js, + ) # TODO: after SDK bug is addressed, don't catch the exception except McpError as e: return GetPromptResult( diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py index 96c1cb38c7..9825ab5e7e 100644 --- a/src/fetch/tests/test_server.py +++ b/src/fetch/tests/test_server.py @@ -87,6 +87,54 @@ def test_empty_content_returns_error(self): result = extract_content_from_html(html) assert "" in result + def test_uses_readability_js_when_node_is_available(self): + html = "

Hello

" + + with ( + patch("mcp_server_fetch.server.shutil.which", return_value="node"), + patch( + "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string" + ) as mock_simple_json, + ): + mock_simple_json.return_value = {"content": "

Hello

"} + + result = extract_content_from_html(html) + + mock_simple_json.assert_called_once_with(html, use_readability=True) + assert "Hello" in result + + def test_falls_back_without_node(self): + html = "

Hello

" + + with ( + patch("mcp_server_fetch.server.shutil.which", return_value=None), + patch( + "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string" + ) as mock_simple_json, + ): + mock_simple_json.return_value = {"content": "

Hello

"} + + result = extract_content_from_html(html) + + mock_simple_json.assert_called_once_with(html, use_readability=False) + assert "Hello" in result + + def test_can_disable_readability_js(self): + html = "

Hello

" + + with ( + patch("mcp_server_fetch.server.shutil.which", return_value="node"), + patch( + "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string" + ) as mock_simple_json, + ): + mock_simple_json.return_value = {"content": "

Hello

"} + + result = extract_content_from_html(html, use_readability_js=False) + + mock_simple_json.assert_called_once_with(html, use_readability=False) + assert "Hello" in result + class TestCheckMayAutonomouslyFetchUrl: """Tests for check_may_autonomously_fetch_url function.""" @@ -219,6 +267,37 @@ async def test_fetch_html_page(self): assert isinstance(content, str) assert prefix == "" + @pytest.mark.asyncio + async def test_fetch_html_forwards_readability_js_option(self): + """Test that fetch_url forwards the readability JS option.""" + html_content = "

Test

" + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = html_content + mock_response.headers = {"content-type": "text/html"} + + with ( + patch("httpx.AsyncClient") as mock_client_class, + patch( + "mcp_server_fetch.server.extract_content_from_html", + return_value="Test", + ) as mock_extract, + ): + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_response) + mock_client_class.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_client_class.return_value.__aexit__ = AsyncMock(return_value=None) + + content, prefix = await fetch_url( + "https://example.com/page", + DEFAULT_USER_AGENT_AUTONOMOUS, + use_readability_js=False, + ) + + mock_extract.assert_called_once_with(html_content, use_readability_js=False) + assert content == "Test" + assert prefix == "" + @pytest.mark.asyncio async def test_fetch_html_page_raw(self): """Test fetching an HTML page with raw=True returns original HTML."""