From 72e451c5513089d7786143983c0c0aa62df640b9 Mon Sep 17 00:00:00 2001 From: octo-patch Date: Fri, 17 Apr 2026 11:31:41 +0800 Subject: [PATCH] fix: write UTF-8 to stdout.buffer to avoid UnicodeEncodeError on non-UTF-8 systems On Windows systems with a non-UTF-8 locale (e.g. GBK on Chinese Windows), running `markitdown file.pdf > output.md` raises: UnicodeEncodeError: 'gbk' codec can't encode character '\u2022' Two problems existed in the previous approach of encoding to sys.stdout.encoding with errors='replace': 1. sys.stdout.encoding can be None when stdout is a raw pipe, causing a TypeError. 2. Characters are silently replaced with '?' (lossy output), which is undesirable when redirecting to a file. Fix by writing UTF-8 encoded bytes directly to sys.stdout.buffer when available. This produces lossless UTF-8 output regardless of the system locale, matching the behaviour of the -o/--output flag. A safe fallback handles the rare case where stdout.buffer is absent. Fixes #1788 --- packages/markitdown/src/markitdown/__main__.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6bb..39eaab767 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -206,12 +206,19 @@ def _handle_output(args, result: DocumentConverterResult): with open(args.output, "w", encoding="utf-8") as f: f.write(result.markdown) else: - # Handle stdout encoding errors more gracefully - print( - result.markdown.encode(sys.stdout.encoding, errors="replace").decode( - sys.stdout.encoding + # Write UTF-8 directly to the underlying binary buffer when available. + # This avoids UnicodeEncodeError on systems whose locale encoding + # (e.g. GBK on Chinese Windows) cannot represent all Unicode characters + # in the markdown output, and also handles the case where + # sys.stdout.encoding is None (e.g. when stdout is a raw pipe). + if hasattr(sys.stdout, "buffer"): + sys.stdout.buffer.write(result.markdown.encode("utf-8")) + sys.stdout.buffer.flush() + else: + encoding = sys.stdout.encoding or "utf-8" + print( + result.markdown.encode(encoding, errors="replace").decode(encoding) ) - ) def _exit_with_error(message: str):