From d29a3da2967187bf18ffe317d2bc1be1bdb2c0fb Mon Sep 17 00:00:00 2001
From: venti <1308199824@qq.com>
Date: Sat, 30 May 2026 16:05:01 +0800
Subject: [PATCH] fix: fall back to charset_normalizer when detected charset fails to decode CSV/plain text (fixes #1949)

---
 .../src/markitdown/converters/_csv_converter.py | 8 ++++++--
 .../src/markitdown/converters/_plain_text_converter.py | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py
index 7e9631e1b..888ebb82c 100644
--- a/packages/markitdown/src/markitdown/converters/_csv_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py
@@ -42,10 +42,14 @@ def convert(
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
         # Read the file content
+        raw = file_stream.read()
         if stream_info.charset:
-            content = file_stream.read().decode(stream_info.charset)
+            try:
+                content = raw.decode(stream_info.charset)
+            except UnicodeDecodeError:
+                content = str(from_bytes(raw).best())
         else:
-            content = str(from_bytes(file_stream.read()).best())
+            content = str(from_bytes(raw).best())
 
         # Parse CSV content
         reader = csv.reader(io.StringIO(content))
diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
index 6f1306fe8..abaccb326 100644
--- a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -63,9 +63,13 @@ def convert(
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        raw = file_stream.read()
         if stream_info.charset:
-            text_content = file_stream.read().decode(stream_info.charset)
+            try:
+                text_content = raw.decode(stream_info.charset)
+            except UnicodeDecodeError:
+                text_content = str(from_bytes(raw).best())
         else:
-            text_content = str(from_bytes(file_stream.read()).best())
+            text_content = str(from_bytes(raw).best())
 
         return DocumentConverterResult(markdown=text_content)