Skip to content

Commit 87a2ffc

Browse files
committed
fix: send browser user-agent for HTTP conversions
1 parent a51f725 commit 87a2ffc

2 files changed

Lines changed: 36 additions & 1 deletion

File tree

packages/markitdown/src/markitdown/_markitdown.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,12 @@
1919
from ._stream_info import StreamInfo
2020
from ._uri_utils import parse_data_uri, file_uri_to_path
2121

22+
DEFAULT_HTTP_USER_AGENT = (
23+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
24+
"AppleWebKit/537.36 (KHTML, like Gecko) "
25+
"Chrome/124.0.0.0 Safari/537.36"
26+
)
27+
2228
from .converters import (
2329
PlainTextConverter,
2430
HtmlConverter,
@@ -449,7 +455,8 @@ def convert_uri(
449455
)
450456
# HTTP/HTTPS URIs
451457
elif uri.startswith("http:") or uri.startswith("https:"):
452-
response = self._requests_session.get(uri, stream=True)
458+
headers = {"User-Agent": DEFAULT_HTTP_USER_AGENT}
459+
response = self._requests_session.get(uri, stream=True, headers=headers)
453460
response.raise_for_status()
454461
return self.convert_response(
455462
response,

packages/markitdown/tests/test_module_misc.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -552,3 +552,31 @@ def test_markitdown_llm() -> None:
552552
test()
553553
print("OK")
554554
print("All tests passed!")
555+
556+
557+
def test_http_uri_uses_browser_user_agent() -> None:
558+
markitdown = MarkItDown()
559+
560+
response = MagicMock()
561+
response.headers = {"content-type": "text/html"}
562+
response.url = "https://example.com/test.html"
563+
response.raise_for_status.return_value = None
564+
565+
expected_result = MagicMock()
566+
markitdown.convert_response = MagicMock(return_value=expected_result)
567+
markitdown._requests_session.get = MagicMock(return_value=response)
568+
569+
result = markitdown.convert("https://example.com/test.html")
570+
571+
assert result is expected_result
572+
markitdown._requests_session.get.assert_called_once()
573+
_, kwargs = markitdown._requests_session.get.call_args
574+
assert kwargs["stream"] is True
575+
assert "headers" in kwargs
576+
assert kwargs["headers"]["User-Agent"].startswith("Mozilla/5.0")
577+
markitdown.convert_response.assert_called_once_with(
578+
response,
579+
stream_info=None,
580+
file_extension=None,
581+
url=None,
582+
)

0 commit comments

Comments
 (0)