|
def _make_fake_page(width: float, rows: list[list[dict]]):
    """Build a minimal stand-in for a PDF page, for word-layout tests.

    Args:
        width: Value exposed as the page's ``width`` attribute.
        rows: One list per text row, top to bottom. Each entry is a dict with
            ``"text"`` and ``"x0"`` keys and an optional ``"w"`` key giving
            the word width (defaults to 12).

    Returns:
        An object with a ``width`` attribute and an ``extract_words`` method
        that returns word dicts with ``text``, ``x0``, ``x1`` and ``top`` keys.
    """

    class FakePage:
        def __init__(self, width: float, rows: list[list[dict]]):
            self.width = width
            self._words = []
            for row_index, row in enumerate(rows):
                # Rows are stacked downwards: the first sits at y=50 and each
                # following row is 12 units lower.
                y_top = float(50 + row_index * 12)
                for word in row:
                    x0 = float(word["x0"])
                    self._words.append(
                        {
                            "text": word["text"],
                            "x0": x0,
                            "x1": x0 + float(word.get("w", 12)),
                            "top": y_top,
                        }
                    )

        def extract_words(self, keep_blank_chars=True, x_tolerance=3, y_tolerance=3):
            # The keyword arguments are accepted but ignored: the words were
            # fully determined at construction time. A fresh list is returned
            # so callers cannot reorder or truncate the stored words.
            return list(self._words)

    return FakePage(width=width, rows=rows)
| 22 | + |
| 23 | + |
def test_multicolumn_prose_falls_back_to_text_extraction():
    """Regression: wide multi-column prose should not be emitted as a table.

    This page shape mimics the failure mode from issue #120: many tentative
    columns are discovered across the page, but each row only uses a small
    fraction of them. That is typical of two-column prose with staggered word
    positions, not real form/table data.
    """

    from markitdown.converters._pdf_converter import _extract_form_content_from_words

    # Thirteen stable x positions across the page; each row only touches four of
    # them, which should be treated as sparse multi-column prose rather than a
    # dense table.
    x_positions = [50, 105, 160, 215, 270, 325, 380, 435, 490, 545, 600, 655, 710]
    labels = ("alpha", "beta", "gamma", "delta")

    # Row i uses the four consecutive positions starting at index i, so the
    # occupied window slides one column to the right on every row.
    rows = [
        [
            {"x0": x, "text": f"{label}{i}"}
            for label, x in zip(labels, x_positions[i : i + len(labels)])
        ]
        for i in range(10)
    ]

    fake_page = _make_fake_page(width=760, rows=rows)

    assert _extract_form_content_from_words(fake_page) is None
| 55 | + |
| 56 | + |
def test_wide_dense_table_is_still_extracted():
    """Wide but dense tables should survive the sparse-prose guard."""

    from markitdown.converters._pdf_converter import _extract_form_content_from_words

    x_positions = [50, 105, 160, 215, 270, 325, 380, 435, 490, 545, 600]

    # Six rows, each with a word at every one of the eleven x positions, so
    # every column is populated in every row.
    rows = [
        [{"x0": x, "text": f"c{col}_{i}"} for col, x in enumerate(x_positions)]
        for i in range(6)
    ]

    fake_page = _make_fake_page(width=660, rows=rows)
    output = _extract_form_content_from_words(fake_page)

    assert output is not None
    # None of the generated cell texts contain a pipe, so its presence comes
    # from the extractor's own output formatting.
    assert "|" in output
| 77 | + |
0 commit comments