From 72e451c5513089d7786143983c0c0aa62df640b9 Mon Sep 17 00:00:00 2001
From: octo-patch <octo-patch@github.com>
Date: Fri, 17 Apr 2026 11:31:41 +0800
Subject: [PATCH] fix: write UTF-8 to stdout.buffer to avoid UnicodeEncodeError
 on non-UTF-8 systems

On Windows systems with a non-UTF-8 locale (e.g. GBK on Chinese
Windows), running `markitdown file.pdf > output.md` raises:

  UnicodeEncodeError: 'gbk' codec can't encode character '\u2022'

Two problems existed in the previous approach of encoding to
sys.stdout.encoding with errors='replace':
1. sys.stdout.encoding can be None when stdout is a raw pipe,
   causing a TypeError.
2. Characters that the target encoding cannot represent are silently
   replaced with '?' (lossy output), which is undesirable when
   redirecting to a file.

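Fix by writing UTF-8 encoded bytes directly to sys.stdout.buffer
when available. This produces lossless UTF-8 output regardless of
the system locale, matching the behaviour of the -o/--output flag.
Note that, unlike print(), this path does not append a trailing
newline to the markdown.
A safe fallback handles the rare case where stdout.buffer is absent;
it still goes through print() with errors='replace', so it remains
lossy but no longer fails when sys.stdout.encoding is None.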

Fixes #1788
---
 packages/markitdown/src/markitdown/__main__.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py
index 6085ad6bb..39eaab767 100644
--- a/packages/markitdown/src/markitdown/__main__.py
+++ b/packages/markitdown/src/markitdown/__main__.py
@@ -206,12 +206,19 @@ def _handle_output(args, result: DocumentConverterResult):
         with open(args.output, "w", encoding="utf-8") as f:
             f.write(result.markdown)
     else:
-        # Handle stdout encoding errors more gracefully
-        print(
-            result.markdown.encode(sys.stdout.encoding, errors="replace").decode(
-                sys.stdout.encoding
+        # Write UTF-8 directly to the underlying binary buffer when available.
+        # This avoids UnicodeEncodeError on systems whose locale encoding
+        # (e.g. GBK on Chinese Windows) cannot represent all Unicode characters
+        # in the markdown output, and also handles the case where
+        # sys.stdout.encoding is None (e.g. when stdout is a raw pipe).
+        if hasattr(sys.stdout, "buffer"):
+            sys.stdout.buffer.write(result.markdown.encode("utf-8"))
+            sys.stdout.buffer.flush()
+        else:
+            encoding = sys.stdout.encoding or "utf-8"
+            print(
+                result.markdown.encode(encoding, errors="replace").decode(encoding)
             )
-        )
 
 
 def _exit_with_error(message: str):