From 1a904f604b1973a91eff529e0d2bec30d7b51361 Mon Sep 17 00:00:00 2001 From: fuleinist Date: Mon, 20 Apr 2026 00:19:12 +0800 Subject: [PATCH] fix: write UTF-8 bytes directly to stdout to avoid encoding errors On Windows (and other platforms) where sys.stdout.encoding is limited (e.g., cp1252, gbk), piping markitdown output to a file causes UnicodeEncodeError for characters outside the target encoding. The previous workaround of encode+decode with errors='replace' still failed when stdout.encoding was None, and didn't solve the root issue of stdout's limited encoding. This fix writes UTF-8 bytes directly to sys.stdout.buffer, which: - Bypasses stdout's text encoding limitation - Works reliably when stdout is piped or redirected - Handles all Unicode characters correctly - Falls back to print() with encoding='utf-8' for unusual cases Fixes: microsoft/markitdown#1788 Fixes: microsoft/markitdown#1597 --- packages/markitdown/src/markitdown/__main__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6bb..d4b352c6f 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -206,12 +206,14 @@ def _handle_output(args, result: DocumentConverterResult): with open(args.output, "w", encoding="utf-8") as f: f.write(result.markdown) else: - # Handle stdout encoding errors more gracefully - print( - result.markdown.encode(sys.stdout.encoding, errors="replace").decode( - sys.stdout.encoding - ) - ) + # Write UTF-8 bytes directly to stdout's underlying buffer to avoid + # encoding errors on Windows (cp1252, gbk, etc.) and piped stdout + # where sys.stdout.encoding may be limited or None. 
+        if hasattr(sys.stdout, "buffer"):
+            sys.stdout.buffer.write((result.markdown + "\n").encode("utf-8"))
+        else:
+            # Text-only stdout (no .buffer); print() has no encoding/errors kwargs
+            print(result.markdown)
 
 
 def _exit_with_error(message: str):