From f21fede1d46b911e0d39653e7453546a1ba12a90 Mon Sep 17 00:00:00 2001 From: Yufeng He <40085740+he-yufeng@users.noreply.github.com> Date: Sat, 30 May 2026 18:27:27 +0800 Subject: [PATCH] fix: preserve malformed docx math content --- .../converter_utils/docx/math/omml.py | 4 ++- .../converter_utils/docx/pre_process.py | 2 ++ packages/markitdown/tests/test_docx_math.py | 26 +++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 packages/markitdown/tests/test_docx_math.py diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py index dfa734cdc..73d93c5c8 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py @@ -272,7 +272,9 @@ def do_fname(self, elm): if FUNC.get(t): latex_chars.append(FUNC[t]) else: - raise NotImplementedError("Not support func %s" % t) + latex_chars.append( + "\\operatorname{%s}(%s)" % (escape_latex(t), FUNC_PLACE) + ) else: latex_chars.append(t) t = BLANK.join(latex_chars) diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py index d6fa8db69..05e00a610 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py @@ -44,6 +44,8 @@ def _convert_omath_to_latex(tag: Tag) -> str: math_root = ET.fromstring(MATH_ROOT_TEMPLATE.format(str(tag))) # Find the 'oMath' element within the XML document math_element = math_root.find(OMML_NS + "oMath") + if math_element is None: + return tag.get_text("", strip=True) # Convert the 'oMath' element to LaTeX using the oMath2Latex function latex = oMath2Latex(math_element).latex return latex diff --git a/packages/markitdown/tests/test_docx_math.py b/packages/markitdown/tests/test_docx_math.py new file mode 100644 index 000000000..37bcbf17f --- /dev/null +++ b/packages/markitdown/tests/test_docx_math.py @@ -0,0 +1,26 @@ +from bs4 import BeautifulSoup +from defusedxml import ElementTree as ET + +from markitdown.converter_utils.docx.math.omml import oMath2Latex +from markitdown.converter_utils.docx.pre_process import _convert_omath_to_latex + + +def test_convert_omath_without_namespaced_child_returns_text() -> None: + soup = BeautifulSoup(b"x", "xml") + + assert _convert_omath_to_latex(soup.find("oMath")) == "x" + + +def test_unknown_omml_function_uses_operatorname() -> None: + root = ET.fromstring( + """ + + + log + x + + + """ + ) + + assert oMath2Latex(root).latex == r"\operatorname{log}(x)"