File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1919from ._stream_info import StreamInfo
2020from ._uri_utils import parse_data_uri , file_uri_to_path
2121
# Browser-style User-Agent string; passed as the "User-Agent" request header
# when an http:/https: URI is fetched.
DEFAULT_HTTP_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/124.0.0.0 Safari/537.36",
    )
)
27+
2228from .converters import (
2329 PlainTextConverter ,
2430 HtmlConverter ,
@@ -449,7 +455,8 @@ def convert_uri(
449455 )
450456 # HTTP/HTTPS URIs
451457 elif uri .startswith ("http:" ) or uri .startswith ("https:" ):
452- response = self ._requests_session .get (uri , stream = True )
458+ headers = {"User-Agent" : DEFAULT_HTTP_USER_AGENT }
459+ response = self ._requests_session .get (uri , stream = True , headers = headers )
453460 response .raise_for_status ()
454461 return self .convert_response (
455462 response ,
Original file line number Diff line number Diff line change @@ -552,3 +552,31 @@ def test_markitdown_llm() -> None:
552552 test ()
553553 print ("OK" )
554554 print ("All tests passed!" )
555+
556+
def test_http_uri_uses_browser_user_agent() -> None:
    """Converting an https: URI must send a browser-like User-Agent header."""
    target_url = "https://example.com/test.html"
    converter = MarkItDown()

    # Canned HTTP response so the session never touches the network.
    fake_response = MagicMock()
    fake_response.headers = {"content-type": "text/html"}
    fake_response.url = target_url
    fake_response.raise_for_status.return_value = None

    # Stub out both the downstream conversion and the HTTP GET itself.
    sentinel_result = MagicMock()
    convert_response_mock = MagicMock(return_value=sentinel_result)
    session_get_mock = MagicMock(return_value=fake_response)
    converter.convert_response = convert_response_mock
    converter._requests_session.get = session_get_mock

    outcome = converter.convert(target_url)

    # The stubbed conversion result is handed back untouched.
    assert outcome is sentinel_result

    # Exactly one GET, streamed, carrying a Mozilla-style User-Agent.
    session_get_mock.assert_called_once()
    get_kwargs = session_get_mock.call_args[1]
    assert get_kwargs["stream"] is True
    assert "headers" in get_kwargs
    assert get_kwargs["headers"]["User-Agent"].startswith("Mozilla/5.0")

    # The fetched response is forwarded with no extra hints supplied.
    convert_response_mock.assert_called_once_with(
        fake_response,
        stream_info=None,
        file_extension=None,
        url=None,
    )
You can’t perform that action at this time.
0 commit comments