From 1a904f604b1973a91eff529e0d2bec30d7b51361 Mon Sep 17 00:00:00 2001
From: fuleinist <fuleinist@outlook.com>
Date: Mon, 20 Apr 2026 00:19:12 +0800
Subject: [PATCH] fix: write UTF-8 bytes directly to stdout to avoid encoding
 errors

On Windows (and other platforms) where sys.stdout.encoding is limited
(e.g., cp1252, gbk), piping markitdown output to a file causes
UnicodeEncodeError for characters outside the target encoding.

The previous workaround of encode+decode with errors='replace' still
failed when stdout.encoding was None, and didn't solve the root
issue of stdout's limited encoding.

This fix writes UTF-8 bytes directly to sys.stdout.buffer, which:
- Bypasses stdout's text encoding limitation
- Works reliably when stdout is piped or redirected
- Handles all Unicode characters correctly
- Falls back to plain print() when stdout exposes no binary buffer

Fixes: microsoft/markitdown#1788
Fixes: microsoft/markitdown#1597
---
 packages/markitdown/src/markitdown/__main__.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index 6085ad6bb..d4b352c6f 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -206,12 +206,14 @@ def _handle_output(args, result: DocumentConverterResult):
         with open(args.output, "w", encoding="utf-8") as f:
             f.write(result.markdown)
     else:
-        # Handle stdout encoding errors more gracefully
-        print(
-            result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
-                sys.stdout.encoding
-            )
-        )
+        # UTF-8 bytes via stdout's binary buffer sidestep narrow encodings.
+        if hasattr(sys.stdout, "buffer"):
+            data = (result.markdown + "\n").encode("utf-8", errors="replace")
+            sys.stdout.flush()  # keep earlier text-layer output in order
+            sys.stdout.buffer.write(data)
+        else:
+            # Text-only stream: print() accepts no encoding=/errors= kwargs.
+            print(result.markdown)
 
 
 def _exit_with_error(message: str):