Skip to content

Commit e276945

Browse files
fix: avoid misclassifying sparse pdf prose as tables
1 parent a51f725 commit e276945

2 files changed

Lines changed: 98 additions & 1 deletion

File tree

packages/markitdown/src/markitdown/converters/_pdf_converter.py

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -278,7 +278,8 @@ def _extract_form_content_from_words(page: Any) -> str | None:
278278
break
279279

280280
# If row uses 2+ of the established columns, it's a table row
281-
info["is_table_row"] = len(aligned_columns) >= 2
281+
info["aligned"] = len(aligned_columns)
282+
info["is_table_row"] = info["aligned"] >= 2
282283

283284
# Find table regions (consecutive table rows)
284285
table_regions: list[tuple[int, int]] = [] # (start_idx, end_idx)
@@ -298,6 +299,25 @@ def _extract_form_content_from_words(page: Any) -> str | None:
298299
if len(row_info) > 0 and total_table_rows / len(row_info) < 0.2:
299300
return None
300301

302+
# Number of columns that will be used when extracting cells
303+
num_cols = len(global_columns)
304+
305+
# Extra guard: multi-column academic prose can look like a very wide,
306+
# sparsely populated table. Real form/table pages in this converter tend to
307+
# use a modest number of stable columns; prose pages instead produce many
308+
# tentative columns with only a few populated per row. Reject those before
309+
# formatting markdown tables.
310+
table_row_fill_ratios = [
311+
info.get("aligned", 0) / num_cols
312+
for info in row_info
313+
if info.get("is_table_row")
314+
]
315+
if num_cols > 10 and table_row_fill_ratios:
316+
sorted_fill_ratios = sorted(table_row_fill_ratios)
317+
median_fill_ratio = sorted_fill_ratios[len(sorted_fill_ratios) // 2]
318+
if median_fill_ratio < 0.4:
319+
return None
320+
301321
# Build output - collect table data first, then format with proper column widths
302322
result_lines: list[str] = []
303323
num_cols = len(global_columns)
Lines changed: 77 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,77 @@
1+
def _make_fake_page(width: float, rows: list[list[dict]]):
    """Return a fake page object exposing ``width`` and ``extract_words()``.

    The object is a lightweight test double: it stores a pre-computed list of
    word dictionaries and hands back a copy on every ``extract_words()`` call.

    Args:
        width: Value stored on the page's ``width`` attribute.
        rows: One list per text row. Each word is a dict with ``"text"`` and
            ``"x0"`` keys and an optional ``"w"`` key (word width, default 12).

    Returns:
        A page-like object. ``extract_words()`` returns one dict per word with
        ``text``, ``x0``, ``x1`` and ``top`` keys; row ``i`` is placed at
        ``top = 50 + 12 * i`` and ``x1 = x0 + w``.
    """

    class FakePage:
        def __init__(self, width: float, rows: list[list[dict]]):
            self.width = width
            self._words = []
            for i, row in enumerate(rows):
                # Rows are stacked 12 units apart, starting at y=50.
                y_top = 50 + i * 12
                for w in row:
                    x0 = float(w["x0"])
                    self._words.append(
                        {
                            "text": w["text"],
                            "x0": x0,
                            "x1": x0 + float(w.get("w", 12)),
                            "top": float(y_top),
                        }
                    )

        def extract_words(self, keep_blank_chars=True, x_tolerance=3, y_tolerance=3):
            # The keyword arguments are accepted and ignored; a fresh list is
            # returned so callers cannot mutate the stored words list.
            return list(self._words)

    return FakePage(width=width, rows=rows)
22+
23+
24+
def test_multicolumn_prose_falls_back_to_text_extraction():
    """Regression: wide multi-column prose should not be emitted as a table.

    This page shape mimics the failure mode from issue #120: many tentative
    columns are discovered across the page, but each row only uses a small
    fraction of them. That is typical of two-column prose with staggered word
    positions, not real form/table data.
    """

    from markitdown.converters._pdf_converter import _extract_form_content_from_words

    # Thirteen stable x positions across the page; each row only touches four of
    # them, which should be treated as sparse multi-column prose rather than a
    # dense table.
    x_positions = [50, 105, 160, 215, 270, 325, 380, 435, 490, 545, 600, 655, 710]
    labels = ("alpha", "beta", "gamma", "delta")
    # Row i uses the four consecutive positions starting at index i, so the
    # occupied window slides one column to the right on every row.
    rows = [
        [
            {"x0": x, "text": f"{label}{i}"}
            for label, x in zip(labels, x_positions[i : i + 4])
        ]
        for i in range(10)
    ]

    fake_page = _make_fake_page(width=760, rows=rows)

    # None means the extractor declined to treat the page as a form/table.
    assert _extract_form_content_from_words(fake_page) is None
55+
56+
57+
def test_wide_dense_table_is_still_extracted():
    """Wide but dense tables should survive the sparse-prose guard."""

    from markitdown.converters._pdf_converter import _extract_form_content_from_words

    # Eleven column positions, every one populated on every row.
    x_positions = [50, 105, 160, 215, 270, 325, 380, 435, 490, 545, 600]
    rows = [
        [{"x0": x, "text": f"c{col}_{i}"} for col, x in enumerate(x_positions)]
        for i in range(6)
    ]

    fake_page = _make_fake_page(width=660, rows=rows)
    output = _extract_form_content_from_words(fake_page)

    # The page must still be extracted, and the output must contain a pipe
    # character (the markdown table cell delimiter).
    assert output is not None
    assert "|" in output
77+

0 commit comments

Comments
 (0)