Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/fetch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ The fetch tool will truncate the response, but by using the `start_index` argume

## Installation

Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust.
Optionally, install Node.js; the fetch server will then use a different, more robust HTML simplifier. If Node.js is not available, the server falls back to readabilipy's Python-only HTML simplifier.

### Using uv (recommended)

Expand Down Expand Up @@ -170,6 +170,10 @@ This can be customized by adding the argument `--user-agent=YourUserAgent` to th

The server can be configured to use a proxy by using the `--proxy-url` argument.

### Customization - HTML simplification

By default, the server uses readabilipy's optional Node.js simplifier when Node.js is available, and otherwise falls back to the Python-only simplifier. To force the Python-only path even when Node.js is installed, add the `--no-readability-js` argument.

## Windows Configuration

If you're experiencing timeout issues on Windows, you may need to set the `PYTHONIOENCODING` environment variable to ensure proper character encoding:
Expand Down
14 changes: 13 additions & 1 deletion src/fetch/src/mcp_server_fetch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,21 @@ def main():
help="Ignore robots.txt restrictions",
)
parser.add_argument("--proxy-url", type=str, help="Proxy URL to use for requests")
parser.add_argument(
"--no-readability-js",
action="store_true",
help="Use readabilipy's Python-only HTML simplifier even when Node.js is installed",
)

args = parser.parse_args()
asyncio.run(serve(args.user_agent, args.ignore_robots_txt, args.proxy_url))
asyncio.run(
serve(
custom_user_agent=args.user_agent,
ignore_robots_txt=args.ignore_robots_txt,
proxy_url=args.proxy_url,
use_readability_js=not args.no_readability_js,
)
)


if __name__ == "__main__":
Expand Down
34 changes: 28 additions & 6 deletions src/fetch/src/mcp_server_fetch/server.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import shutil
from typing import Annotated, Tuple
from urllib.parse import urlparse, urlunparse

Expand All @@ -24,7 +25,11 @@
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"


def extract_content_from_html(html: str) -> str:
def _can_use_readability_js(use_readability_js: bool) -> bool:
    """Decide whether the Node.js-backed simplifier should be requested.

    Args:
        use_readability_js: The caller's preference. A falsy value disables
            the Node.js path without looking for an executable.

    Returns:
        True only when the caller asked for the Node.js simplifier and
        ``shutil.which`` can locate a ``node`` executable.
    """
    if not use_readability_js:
        # Opted out: skip the executable lookup entirely.
        return use_readability_js
    return shutil.which("node") is not None


def extract_content_from_html(html: str, use_readability_js: bool = True) -> str:
"""Extract and convert HTML content to Markdown format.

Args:
Expand All @@ -34,7 +39,7 @@ def extract_content_from_html(html: str) -> str:
Simplified markdown version of the content
"""
ret = readabilipy.simple_json.simple_json_from_html_string(
html, use_readability=True
html, use_readability=_can_use_readability_js(use_readability_js)
)
if not ret["content"]:
return "<error>Page failed to be simplified from HTML</error>"
Expand Down Expand Up @@ -109,7 +114,11 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str, proxy_url:


async def fetch_url(
url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None
url: str,
user_agent: str,
force_raw: bool = False,
proxy_url: str | None = None,
use_readability_js: bool = True,
) -> Tuple[str, str]:
"""
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
Expand Down Expand Up @@ -140,7 +149,9 @@ async def fetch_url(
)

if is_page_html and not force_raw:
return extract_content_from_html(page_raw), ""
return extract_content_from_html(
page_raw, use_readability_js=use_readability_js
), ""

return (
page_raw,
Expand Down Expand Up @@ -182,13 +193,15 @@ async def serve(
custom_user_agent: str | None = None,
ignore_robots_txt: bool = False,
proxy_url: str | None = None,
use_readability_js: bool = True,
) -> None:
"""Run the fetch MCP server.

Args:
custom_user_agent: Optional custom User-Agent string to use for requests
ignore_robots_txt: Whether to ignore robots.txt restrictions
proxy_url: Optional proxy URL to use for requests
use_readability_js: Whether to use readabilipy's optional Node.js simplifier
"""
server = Server("mcp-fetch")
user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
Expand Down Expand Up @@ -235,7 +248,11 @@ async def call_tool(name, arguments: dict) -> list[TextContent]:
await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url)

content, prefix = await fetch_url(
url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url
url,
user_agent_autonomous,
force_raw=args.raw,
proxy_url=proxy_url,
use_readability_js=use_readability_js,
)
original_length = len(content)
if args.start_index >= original_length:
Expand All @@ -262,7 +279,12 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
url = arguments["url"]

try:
content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url)
content, prefix = await fetch_url(
url,
user_agent_manual,
proxy_url=proxy_url,
use_readability_js=use_readability_js,
)
# TODO: after SDK bug is addressed, don't catch the exception
except McpError as e:
return GetPromptResult(
Expand Down
79 changes: 79 additions & 0 deletions src/fetch/tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,54 @@ def test_empty_content_returns_error(self):
result = extract_content_from_html(html)
assert "<error>" in result

def test_uses_readability_js_when_node_is_available(self):
    """The simplifier is called with use_readability=True when `node` is found."""
    page = "<html><body><article><p>Hello</p></article></body></html>"
    simplifier_target = (
        "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
    )

    # Pretend a `node` executable exists and stub out the real simplifier.
    with patch("mcp_server_fetch.server.shutil.which", return_value="node"):
        with patch(
            simplifier_target, return_value={"content": "<p>Hello</p>"}
        ) as simplifier:
            markdown = extract_content_from_html(page)

    simplifier.assert_called_once_with(page, use_readability=True)
    assert "Hello" in markdown

def test_falls_back_without_node(self):
    """The simplifier is called with use_readability=False when `node` is missing."""
    page = "<html><body><article><p>Hello</p></article></body></html>"
    simplifier_target = (
        "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
    )

    # Simulate a machine without a `node` executable.
    with patch("mcp_server_fetch.server.shutil.which", return_value=None):
        with patch(
            simplifier_target, return_value={"content": "<p>Hello</p>"}
        ) as simplifier:
            markdown = extract_content_from_html(page)

    simplifier.assert_called_once_with(page, use_readability=False)
    assert "Hello" in markdown

def test_can_disable_readability_js(self):
    """Passing use_readability_js=False wins even when `node` is found."""
    page = "<html><body><article><p>Hello</p></article></body></html>"
    simplifier_target = (
        "mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string"
    )

    # `node` is reported as present, so only the explicit flag can disable it.
    with patch("mcp_server_fetch.server.shutil.which", return_value="node"):
        with patch(
            simplifier_target, return_value={"content": "<p>Hello</p>"}
        ) as simplifier:
            markdown = extract_content_from_html(page, use_readability_js=False)

    simplifier.assert_called_once_with(page, use_readability=False)
    assert "Hello" in markdown


class TestCheckMayAutonomouslyFetchUrl:
"""Tests for check_may_autonomously_fetch_url function."""
Expand Down Expand Up @@ -219,6 +267,37 @@ async def test_fetch_html_page(self):
assert isinstance(content, str)
assert prefix == ""

@pytest.mark.asyncio
async def test_fetch_html_forwards_readability_js_option(self):
    """fetch_url hands use_readability_js on to extract_content_from_html."""
    page = "<html><body><h1>Test</h1></body></html>"

    # Canned HTTP response that fetch_url should treat as an HTML page.
    response = MagicMock()
    response.status_code = 200
    response.text = page
    response.headers = {"content-type": "text/html"}

    http_client = AsyncMock()
    http_client.get = AsyncMock(return_value=response)

    with patch("httpx.AsyncClient") as client_cls:
        with patch(
            "mcp_server_fetch.server.extract_content_from_html",
            return_value="Test",
        ) as extract:
            # Make `async with AsyncClient(...)` yield the stub client.
            client_cm = client_cls.return_value
            client_cm.__aenter__ = AsyncMock(return_value=http_client)
            client_cm.__aexit__ = AsyncMock(return_value=None)

            content, prefix = await fetch_url(
                "https://example.com/page",
                DEFAULT_USER_AGENT_AUTONOMOUS,
                use_readability_js=False,
            )

    extract.assert_called_once_with(page, use_readability_js=False)
    assert content == "Test"
    assert prefix == ""

@pytest.mark.asyncio
async def test_fetch_html_page_raw(self):
"""Test fetching an HTML page with raw=True returns original HTML."""
Expand Down
Loading