In [None]:
# Install poppler-utils for PDF processing
# NOTE(review): presumably required by pdf2image's convert_from_path (imported below),
# which rasterizes PDF pages — confirm against the pdf2image install notes.
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.10 [186 kB]
Fetched 186 kB in 0s (1,475 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126374 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.10_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.10) ...
Setting up poppler-utils (22.02.0-2ubuntu0.10) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
# -------------------------
# Install requirements
# -------------------------
!pip install python-doctr paddleocr opencv-python pandas tabulate paddlepaddle pdf2image

Collecting python-doctr
  Downloading python_doctr-1.0.0-py3-none-any.whl.metadata (32 kB)
Collecting paddleocr
  Downloading paddleocr-3.2.0-py3-none-any.whl.metadata (29 kB)
Collecting paddlepaddle
  Downloading paddlepaddle-3.2.0-cp312-cp312-manylinux1_x86_64.whl.metadata (8.8 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting onnx<3.0.0,>=1.12.0 (from python-doctr)
  Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting pypdfium2<5.0.0,>=4.11.0 (from python-doctr)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyclipper<2.0.0,>=1.2.0 (from python-doctr)
  Downloading pyclipper-1.3.0.post6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting langdetec

In [None]:
# -------------------------
# Imports
# -------------------------
import os
import cv2
import numpy as np
import pandas as pd
from pathlib import Path
from tabulate import tabulate
import matplotlib.pyplot as plt
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from pdf2image import convert_from_path
from paddleocr import PaddleOCR, LayoutDetection

# Image Utils

In [None]:
def convert_to_images(pdf_path, out_dir="ocr_test", dpi=300):
    """
    Rasterize every page of a PDF and save each page as a PNG file.

    Args:
        pdf_path: Path of the PDF to convert.
        out_dir: Directory that receives the PNG files (created if missing).
        dpi: Rendering resolution passed to convert_from_path.

    Returns:
        list of Path: Saved image paths in page order (page0.png, page1.png, ...).
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    saved_paths = []
    # Convert every page (not just the first) and keep track of what was written,
    # so callers get a usable result instead of None.
    for idx, page in enumerate(convert_from_path(pdf_path, dpi=dpi)):
        page_path = out_dir / f"page{idx}.png"
        page.save(page_path, "PNG")
        print("Saved:", page_path)
        saved_paths.append(page_path)
    return saved_paths

In [None]:
import cv2
import numpy as np
from PIL import Image

def preprocess_image(img_path):
    """
    Load an image and prepare it for OCR: grayscale, denoise, enhance contrast.

    Args:
        img_path: Path of the image file (str or path-like).

    Returns:
        The single-channel image after a 3x3 median blur and CLAHE.

    Raises:
        FileNotFoundError: If OpenCV could not read the file.
    """
    # cv2.imread signals failure by returning None instead of raising, which would
    # otherwise surface as an obscure error inside cvtColor.
    img = cv2.imread(str(img_path))
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Denoising
    gray = cv2.medianBlur(gray, 3)

    # Contrast enhancement
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
    gray = clahe.apply(gray)

    # # Resize (optional)
    # h, w = gray.shape
    # if max(h, w) < 1024:
    #     gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    return gray


# Text Utils

### 1. Text Pre-processing

In [None]:
from typing import List, Dict, Tuple

Word = Dict[str, object]  # {'text': str, 'bbox': Tuple[int, int, int, int]}

# -----------------------------
# 0. Preprocessing
# -----------------------------
def preprocess_words(words: List[Word]) -> List[Word]:
    """
    Normalize raw OCR words before they are grouped into lines.

    Each kept word has its text stripped of surrounding whitespace and its
    bounding box coerced to a tuple of ints. Words whose text is empty after
    stripping are dropped. Only the "text" and "bbox" keys are carried over.
    """
    cleaned = []
    for entry in words:
        label = entry["text"].strip()
        if not label:
            continue
        box = tuple(int(value) for value in entry["bbox"])
        cleaned.append({"text": label, "bbox": box})
    return cleaned

In [None]:
def group_words_into_lines(words: List[Word], y_threshold: int = 10) -> List[List[Word]]:
    """
    Split words into text lines using the top edge (y1) of their boxes.

    Words are visited in (top, left) order. A word whose top differs from the
    previously visited word's top by more than `y_threshold` starts a new line.
    Every returned line is sorted left to right.
    """
    def left_edge(word):
        return word["bbox"][0]

    ordered = sorted(words, key=lambda w: (w["bbox"][1], w["bbox"][0]))  # y first, then x
    grouped = []
    pending = []
    last_top = None

    for word in ordered:
        top = word["bbox"][1]
        if last_top is not None and not abs(top - last_top) <= y_threshold:
            # Vertical jump: close the line collected so far
            grouped.append(sorted(pending, key=left_edge))
            pending = []
        pending.append(word)
        last_top = top

    if pending:
        grouped.append(sorted(pending, key=left_edge))
    return grouped

In [None]:
from typing import List, Union, Dict

Word = Dict[str, Union[str, tuple]]  # text + bbox
Table = Dict[str, str]               # {"table": "..."}
LineOrTable = Union[List[Word], Table]


def group_words_and_tables_into_lines(
    items: List[Union[Word, Table]], y_threshold: int = 10
) -> List[LineOrTable]:
    """
    Turn a sequence of words and table placeholders into lines and tables.

    - Consecutive words (items without a "table" key) form a run; each run is
      split into lines by the top y-coordinate of the word boxes.
    - A table dict ends the current run and is emitted unchanged at that position.
    - The result mixes word-lines (lists of words) and table dicts.
    """
    def left_edge(word):
        return word["bbox"][0]

    def lines_from(run):
        """Split one run of words into left-to-right sorted lines."""
        grouped, pending, last_top = [], [], None
        for word in sorted(run, key=lambda w: (w["bbox"][1], w["bbox"][0])):
            top = word["bbox"][1]
            if last_top is not None and not abs(top - last_top) <= y_threshold:
                grouped.append(sorted(pending, key=left_edge))
                pending = []
            pending.append(word)
            last_top = top
        if pending:
            grouped.append(sorted(pending, key=left_edge))
        return grouped

    output = []
    run = []
    for item in items:
        is_table = isinstance(item, dict) and "table" in item
        if not is_table:
            run.append(item)
            continue
        # Emit the words seen so far, then the table itself
        output.extend(lines_from(run))
        run = []
        output.append(item)

    # Words after the last table
    output.extend(lines_from(run))
    return output


In [None]:
def merge_line_text(line: List[Word]) -> str:
    """Join the "text" of every word in the line with single spaces."""
    texts = (word["text"] for word in line)
    return " ".join(texts)

### 2. For Text Post-processing (Alignment)

In [None]:
def get_leftmost_x(lines):
    """
    Return the smallest left x-coordinate over every word of every line.

    Args:
        lines (list of list of dict): Lines of words, each with a "bbox".

    Returns:
        The minimum bbox[0]; float('inf') when there are no words.
    """
    # Seed with infinity so an input without words still yields a value
    candidates = [float('inf')]
    candidates.extend(word["bbox"][0] for line in lines for word in line)
    return min(candidates)

In [None]:
def get_line_indentation(line, leftmost_x, scale=100):
    """
    Build the leading whitespace for a line from its horizontal offset.

    Args:
        line (list of dict): Words of the line; only the first word's bbox is read.
        leftmost_x (float): Reference x-coordinate (leftmost word on the page).
        scale (int): Number of pixels represented by one space.

    Returns:
        str: int(offset / scale) spaces, never fewer than zero; "" for an empty line.
    """
    if line:
        offset = line[0]["bbox"][0] - leftmost_x
        columns = int(offset / scale)
        return " " * max(0, columns)
    return ""

In [None]:
def format_lines_with_spacing(lines, scale=100):
    """
    Render OCR lines as text whose spacing mirrors the word positions.

    Args:
        lines (list of list of dict): Lines of words with "text" and "bbox".
        scale (int): Number of pixels represented by one space.

    Returns:
        str: One output line per non-empty input line, joined with newlines.
            Each line starts with its page-relative indentation, and words are
            separated by at least one space.
    """
    left_margin = get_leftmost_x(lines)
    rendered = []

    for line in lines:
        if not line:
            continue

        pieces = [get_line_indentation(line, left_margin, scale=scale)]

        # Pair each word with its right-hand neighbour to measure the gap
        for current, following in zip(line, line[1:]):
            gap = following["bbox"][0] - current["bbox"][2]
            padding = max(1, int(gap / scale))
            pieces.append(current["text"] + " " * padding)

        # The last word has no neighbour and is appended as-is
        pieces.append(line[-1]["text"])
        rendered.append("".join(pieces))

    return "\n".join(rendered)

In [None]:
def render_text_and_tables_from_ocr(lines, scale=100):
    """
    Render grouped OCR output containing both text lines and tables.

    Args:
        lines (list): Grouped OCR results. A list entry is a text line (words
            with 'text' and 'bbox'); a dict with a "table" key is a table.
        scale (int): Number of pixels represented by one space.

    Returns:
        str: Text lines with position-based spacing; each table string is
            inserted unchanged with a blank line before and after it.
    """
    # Indentation reference comes from text lines only (tables are ignored)
    text_only = [entry for entry in lines if isinstance(entry, list)]
    left_margin = get_leftmost_x(text_only) if text_only else 0

    rendered = []
    for entry in lines:
        if isinstance(entry, dict) and "table" in entry:
            rendered.extend(["", entry["table"], ""])
            continue

        if not entry:  # nothing to render
            continue

        pieces = [get_line_indentation(entry, left_margin, scale=scale)]
        last_idx = len(entry) - 1

        for idx, word in enumerate(entry):
            if "text" not in word:
                continue

            if idx == last_idx:
                pieces.append(word["text"])
                break

            # Gap between this word's right edge and the next word's left edge
            gap = entry[idx + 1]["bbox"][0] - word["bbox"][2]
            padding = max(1, int(gap / scale))
            pieces.append(word["text"] + " " * padding)

        rendered.append("".join(pieces))

    return "\n".join(rendered)


In [None]:
# def format_lines_as_table(lines, scale=100, gap_threshold=3):
#     """
#     Format OCR lines into a table using spacing from bounding boxes.

#     Args:
#         lines (list[list[dict]]): OCR results. Each line is a list of word dicts
#                                   with {"text": str, "bbox": (x0, y0, x1, y1)}.
#         scale (int): Scaling factor to convert pixel distances to spaces.
#         gap_threshold (int): Minimum number of scaled spaces considered a new column.

#     Returns:
#         pd.DataFrame: Table reconstructed from OCR.
#     """
#     table_rows = []

#     for line in lines:
#         if not line:
#             continue

#         row = []
#         current_cell = line[0]["text"]

#         for idx in range(len(line) - 1):
#             this_word = line[idx]
#             next_word = line[idx + 1]

#             # spacing between this word and the next
#             space = next_word["bbox"][0] - this_word["bbox"][2]
#             space = max(0, int(space / scale))

#             if space >= gap_threshold:
#                 # treat as new column
#                 row.append(current_cell.strip())
#                 current_cell = next_word["text"]
#             else:
#                 # same column → keep concatenating
#                 current_cell += " " + next_word["text"]

#         row.append(current_cell.strip())
#         table_rows.append(row)

#     # Normalize to rectangular DataFrame
#     max_cols = max(len(r) for r in table_rows)
#     for r in table_rows:
#         r.extend([""] * (max_cols - len(r)))

#     col_names = [f"col{i+1}" for i in range(max_cols)]
#     df = pd.DataFrame(table_rows, columns=col_names)
#     return df

# Table Utils

In [None]:
# # Layout detection (text in tables)
# def detect_table_content(img):
#   layout_model = LayoutDetection(model_name="PP-DocLayout_plus-L")
#   layout_output = layout_model.predict(img, batch_size=1, layout_nms=True)

#   table_boxes = []
#   for res in layout_output:
#       for box in res["boxes"]:
#           if box["label"] == "table":
#               coords = list(map(int, box["coordinate"]))  # [x1,y1,x2,y2]
#               table_boxes.append(tuple(coords))
#   return table_boxes

In [None]:
# --- Helper: Filter nested tables ---
def filter_nested_tables(table_boxes):
    """
    Drop every table box that lies completely inside a larger kept box.

    Args:
        table_boxes (list of tuple): (x1, y1, x2, y2) table coordinates.

    Returns:
        list of tuple: Surviving boxes, ordered from largest to smallest area.
    """
    def area(box):
        return (box[2]-box[0]) * (box[3]-box[1])

    survivors = []
    # Largest first, so a container is always kept before the boxes inside it
    for candidate in sorted(table_boxes, key=area, reverse=True):
        x1, y1, x2, y2 = candidate
        nested = any(
            x1 >= kx1 and y1 >= ky1 and x2 <= kx2 and y2 <= ky2
            for kx1, ky1, kx2, ky2 in survivors
        )
        if not nested:
            survivors.append(candidate)

    return survivors


# --- Main: Layout detection (text in tables) ---
def detect_table_content(img, remove_nested=False):
    """
    Detect table regions with the PP-DocLayout_plus-L layout model.

    Args:
        img: Input forwarded to LayoutDetection.predict.
        remove_nested (bool): When True, boxes fully contained in a larger
            table box are removed via filter_nested_tables.

    Returns:
        list of tuple: Integer coordinates of every region labelled "table".
    """
    detector = LayoutDetection(model_name="PP-DocLayout_plus-L")
    predictions = detector.predict(img, batch_size=1, layout_nms=True)

    table_boxes = [
        tuple(map(int, region["coordinate"]))  # (x1, y1, x2, y2)
        for page_result in predictions
        for region in page_result["boxes"]
        if region["label"] == "table"
    ]

    return filter_nested_tables(table_boxes) if remove_nested else table_boxes

In [None]:
def subtract_overlap(box, overlap):
    """
    Cut an overlap rectangle out of a box and return the leftover pieces.

    Args:
        box: (x1, y1, x2, y2)
        overlap: (ox1, oy1, ox2, oy2)
    Returns:
        list of up to four rectangles in the order top, bottom, left, right.
        Top and bottom span the full box width; left and right cover only the
        overlap's vertical band.
    """
    x1, y1, x2, y2 = box
    ox1, oy1, ox2, oy2 = overlap

    band_top, band_bottom = max(y1, oy1), min(y2, oy2)
    candidates = (
        (oy1 > y1, (x1, y1, x2, oy1)),                  # top strip
        (oy2 < y2, (x1, oy2, x2, y2)),                  # bottom strip
        (ox1 > x1, (x1, band_top, ox1, band_bottom)),   # left strip
        (ox2 < x2, (ox2, band_top, x2, band_bottom)),   # right strip
    )
    return [rect for keep, rect in candidates if keep]

In [None]:
def remove_overlapped_area(table_boxes):
    """
    Make table boxes non-overlapping while keeping their uncovered parts.

    Boxes are processed from largest to smallest area. Each box is reduced to
    fragments by cutting away its intersection with every rectangle already
    accepted, and the remaining fragments are then accepted themselves.

    Args:
        table_boxes: list of (x1,y1,x2,y2)
    Returns:
        list of rectangles with overlaps removed
    """
    def area(b):
        return (b[2]-b[0])*(b[3]-b[1])

    result = []
    for box in sorted(table_boxes, key=area, reverse=True):
        fragments = [box]
        for kept in result:
            survivors = []
            for frag in fragments:
                ix1, iy1 = max(frag[0], kept[0]), max(frag[1], kept[1])
                ix2, iy2 = min(frag[2], kept[2]), min(frag[3], kept[3])
                if not (ix1 < ix2 and iy1 < iy2):
                    # No intersection with this kept rectangle
                    survivors.append(frag)
                    continue
                survivors.extend(subtract_overlap(frag, (ix1, iy1, ix2, iy2)))
            fragments = survivors
        result.extend(fragments)

    return result

In [None]:
# Layout detection (rows and columns layout in tables)
from paddleocr import TableStructureRecognition

def detect_table_layout(img):
    """Run the SLANet table-structure model on `img` and return its raw predictions."""
    recognizer = TableStructureRecognition(model_name="SLANet")
    return recognizer.predict(input=img, batch_size=1)
# for res in output:
#     res.print(json_format=False)
#     res.save_to_json("./output/res.json")

In [None]:
# Place the table's words into its cell layout, using
# the 8-value cell quadrilaterals produced by PaddleOCR and
# the 4-value word boxes produced by docTR

import pandas as pd
from collections import defaultdict
from itertools import groupby

# ----------------------------
# Helper functions
# ----------------------------
def quad_to_rect(quad):
    """Convert a flat 8-value quadrilateral (x, y pairs) to a rectangle (x0, y0, x1, y1)."""
    xs = quad[0::2]
    ys = quad[1::2]
    return (min(xs), min(ys), max(xs), max(ys))

def assign_text_to_cells(cells, text_boxes):
    """
    Assign each text box to the first cell that contains its center.

    Args:
        cells: list of dicts with 'cell_id' and 'bbox' (x0, y0, x1, y1).
        text_boxes: list of dicts with 'text' and 'bbox' (x0, y0, x1, y1).

    Returns:
        dict: cell_id -> texts of that cell joined left to right with spaces.
            Cells that received no text are absent.
    """
    assigned = defaultdict(list)
    for t in text_boxes:
        tx0, ty0, tx1, ty1 = t['bbox']
        center = ((tx0+tx1)/2, (ty0+ty1)/2)
        for c in cells:
            cx0, cy0, cx1, cy1 = c['bbox']
            if cx0 <= center[0] <= cx1 and cy0 <= center[1] <= cy1:
                assigned[c['cell_id']].append(t)
                break
    # Merge texts per cell
    cell_text = {}
    for cell_id, texts in assigned.items():
        texts.sort(key=lambda t: t['bbox'][0])  # left to right
        cell_text[cell_id] = ' '.join([t['text'] for t in texts])
    return cell_text

# ----------------------------
# Main function
# ----------------------------
def paddleocr_doctr_to_df(paddleocr_output, doctr_text_boxes, row_threshold=10):
    """
    Convert PaddleOCR + docTR output to Pandas DataFrame

    Args:
        paddleocr_output: dict-like whose 'bbox' entry lists one flat 8-value
            quadrilateral per table cell (output of TableStructureRecognition)
        doctr_text_boxes: list of dicts, [{'text': 'abc', 'bbox': (x0,y0,x1,y1)}]
        row_threshold: int, tolerance in pixels to separate rows

    Returns:
        pandas.DataFrame of cell strings (rows x columns); an empty DataFrame
        when no cells were detected.
    """
    # Step 1: Convert quadrilaterals to rects
    cells = [
        {'cell_id': i, 'bbox': quad_to_rect(quad), 'row_id': None, 'col_id': None}
        for i, quad in enumerate(paddleocr_output['bbox'])
    ]
    if not cells:
        return pd.DataFrame()

    # Step 2: Infer row indices by top y-coordinate.
    # The first cell always opens row 0. A numeric sentinel for "no previous row"
    # would misfire for large thresholds and leave the first row with index -1.
    cells.sort(key=lambda c: c['bbox'][1])
    current_row = -1
    row_anchor_y = None
    for c in cells:
        y0 = c['bbox'][1]
        if row_anchor_y is None or y0 - row_anchor_y > row_threshold:
            current_row += 1
            row_anchor_y = y0
        c['row_id'] = current_row

    # Step 3: Infer column indices per row (left to right within each row)
    df_rows = current_row + 1
    df_cols = 0
    for row_id, group in groupby(sorted(cells, key=lambda c: c['row_id']), lambda c: c['row_id']):
        group = list(group)
        group.sort(key=lambda c: c['bbox'][0])
        for col_id, c in enumerate(group):
            c['col_id'] = col_id
        df_cols = max(df_cols, len(group))

    # Step 4: Assign docTR text to cells
    cell_text = assign_text_to_cells(cells, doctr_text_boxes)

    # Step 5: Build DataFrame (rows with fewer cells keep "" in the trailing columns)
    df = pd.DataFrame("", index=range(df_rows), columns=range(df_cols))
    for c in cells:
        row, col = c['row_id'], c['col_id']
        df.iat[row, col] = cell_text.get(c['cell_id'], "")

    return df


In [None]:
# Check whether a docTR word box lies inside any table box (used to filter table text out of the page text)

def inside_table(box, tables):
    """Return True when `box` lies entirely within at least one box of `tables` (edges inclusive)."""
    x1, y1, x2, y2 = box
    return any(
        tx1 <= x1 and ty1 <= y1 and x2 <= tx2 and y2 <= ty2
        for tx1, ty1, tx2, ty2 in tables
    )

In [None]:
# Extract words with boxes and scores
def extract_table_words_with_bboxes(json_ocr_result):
    """
    Flatten an exported OCR result into a list of word records.

    Args:
        json_ocr_result: dict with a 'pages' list. Each page carries
            'dimensions' as (height, width) and nested
            'blocks' -> 'lines' -> 'words'; each word has 'value',
            'confidence' and a two-point 'geometry' ((x0, y0), (x1, y1))
            expressed as fractions of the page size.

    Returns:
        list of dict: one {'text', 'score', 'poly'} per non-empty word, where
        'poly' is a (2, 2) array [[x0, y0], [x1, y1]] in pixels.
    """
    words = []
    for page in json_ocr_result['pages']:
        # Scale by the page's own size instead of a notebook-global image, so the
        # function does not depend on hidden state.
        height, width = page['dimensions']
        scale_xy = np.array([width, height])
        for block in page['blocks']:
            for line in block['lines']:
                for word in line['words']:
                    text = word['value'].strip()
                    if not text:  # Skip empty
                        continue
                    poly = np.array(word['geometry']).reshape(2, 2) * scale_xy
                    words.append({'text': text, 'score': word['confidence'], 'poly': poly})

    print(f"Detected {len(words)} words in table.")
    return words

# Section Utils

In [None]:
# Section Detection
def detect_section_type(line_text: str) -> str:
    """
    Classify a line as "SECTION", "SUBSECTION", "KEY", "PARAGRAPH" or "UNKNOWN".

    Rules, checked in this order:
    - Roman numeral I-X + "." + capitals/spaces -> SECTION
    - Single capital letter + "." + capitals/spaces -> SUBSECTION
    - Line is entirely upper case -> KEY
    - Starts with a capital followed by a lowercase letter -> PARAGRAPH
    - Anything else -> UNKNOWN
    """
    import re

    section_pattern = r"^(I|II|III|IV|V|VI|VII|VIII|IX|X)\.\s+[A-Z ]+$"
    subsection_pattern = r"^[A-Z]\.\s+[A-Z ]+$"
    paragraph_pattern = r"^[A-Z][a-z]"

    if re.match(section_pattern, line_text):
        return "SECTION"
    if re.match(subsection_pattern, line_text):
        return "SUBSECTION"
    if line_text.isupper():
        return "KEY"
    return "PARAGRAPH" if re.match(paragraph_pattern, line_text) else "UNKNOWN"

In [None]:
# Key-Value Extraction
def extract_key_value_pairs(lines: List[str]) -> Dict[str, str]:
    """
    Collect key/value pairs from a sequence of text lines.

    - A line that is entirely upper case starts a new key (resetting its value).
    - Every following non-key line is appended to that key's value, with the
      pieces separated by a single space.
    - Lines appearing before the first key are ignored.

    Args:
        lines: Text lines in reading order.

    Returns:
        dict mapping each key line to its accumulated value ("" if none).
    """
    key_values = {}
    current_key = None

    for line in lines:
        if line.isupper():
            current_key = line
            key_values[current_key] = ""
        elif current_key is not None:
            piece = line.strip()
            if piece:
                # Join with a space; stripping the piece alone would glue
                # consecutive value lines together ("John" + "Smith" -> "JohnSmith").
                existing = key_values[current_key]
                key_values[current_key] = f"{existing} {piece}" if existing else piece
    return key_values

In [None]:
# Section Reconstruction
def reconstruct_document(lines: List[List[Word]]) -> Dict:
    """
    Build hierarchical structure: Sections -> Subsections -> Content.

    Each line is merged to text and classified with detect_section_type:
    - SECTION opens a new top-level dict and clears the active subsection.
    - SUBSECTION opens a list under the current section.
    - KEY is stored as {key: ""} in the active subsection list, or as an empty
      string entry directly under the section when no subsection is active.
    - Any other line is appended to the active subsection, or to the section's
      "Content" list when no subsection is active.
    Lines that appear before the first section are dropped.

    Args:
        lines: Word lines (each a list of word dicts with "text").

    Returns:
        dict keyed by section title.
    """
    document = {}
    current_section = None
    current_subsection = None

    for line in lines:
        line_text = merge_line_text(line)
        section_type = detect_section_type(line_text)

        if section_type == "SECTION":
            current_section = line_text
            document[current_section] = {}
            # The fresh section has no subsections yet; keeping the previous
            # subsection name would raise KeyError on the next content line.
            current_subsection = None
        elif section_type == "SUBSECTION" and current_section:
            current_subsection = line_text
            document[current_section][current_subsection] = []
        elif section_type == "KEY" and current_section:
            if current_subsection:
                document[current_section][current_subsection].append({line_text: ""})
            else:
                document[current_section][line_text] = ""
        else:
            if current_section:
                if current_subsection:
                    document[current_section][current_subsection].append(line_text)
                else:
                    document[current_section].setdefault("Content", []).append(line_text)
    return document

# Pipeline

#### 1. Load PDF and convert to Image

In [None]:
# Load and Convert
# NOTE(review): absolute Colab-style path; adjust when running elsewhere.
pdf_path = "/content/testscanneddocs.pdf"
# convert_to_images writes one PNG per page under ocr_test/ and prints each path;
# `pages` holds whatever that function returns.
pages = convert_to_images(pdf_path)

Saved: ocr_test/page0.png
Saved: ocr_test/page1.png
Saved: ocr_test/page2.png
Saved: ocr_test/page3.png
Saved: ocr_test/page4.png
Saved: ocr_test/page5.png


In [None]:
# Read Images
images_path = "/content/ocr_test"
# os.listdir returns entries in arbitrary order, which makes positional lookups
# such as images[3] pick an unpredictable page. Sorting by (length, name) gives
# page0, page1, ..., page9, page10, ... for the pageN.png files.
images = sorted(os.listdir(images_path), key=lambda name: (len(name), name))

In [None]:
# Inspect the file names found in images_path
images

['page0.png', 'page2.png', 'page5.png', 'page4.png', 'page3.png', 'page1.png']

In [None]:
# Read an Image
# Pick one page by its position in `images` and build its full path.
img = os.path.join(images_path,images[3])

In [None]:
import cv2
# Load the selected page as an array; it is sliced later to crop table regions.
image = cv2.imread(img)

#### 2. Run OCR on Images

In [None]:
# Run Ocr and get bounding boxes

# Load the page image from its path and run a pretrained docTR predictor on it.
doc = DocumentFile.from_images(img)
ocr_model = ocr_predictor(pretrained=True)
ocr_result = ocr_model(doc)

# Flatten pages -> blocks -> lines -> words into {"text", "bbox"} records.
ocr_text_boxes = []
for page in ocr_result.pages:
    h, w = page.dimensions
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                # geometry is unpacked as two corner points; multiplying by the page
                # width/height and truncating yields integer pixel coordinates.
                (x_min, y_min), (x_max, y_max) = word.geometry
                x_min, x_max = int(x_min * w), int(x_max * w)
                y_min, y_max = int(y_min * h), int(y_max * h)
                ocr_text_boxes.append({
                    "text": word.value,
                    "bbox": (x_min, y_min, x_max, y_max)
                })

In [None]:
# Pipeline
# Clean the OCR words (strip text, drop empties, int bboxes), then group them into lines.
words = ocr_text_boxes
words = preprocess_words(words)
lines = group_words_into_lines(words)

In [None]:
# scale=1 maps every pixel of indentation/gap to one space character.
print(format_lines_with_spacing(lines, scale=1))

# Detect and Extract Table Content with DocTR

### 1. Detect and Crop the table Region

In [None]:
# Detect table regions on the page image; boxes nested inside larger ones are removed.
table_boxes = detect_table_content(img, remove_nested=True)

[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PP-DocLayout_plus-L`.[0m


In [None]:
# Number of table regions detected
len(table_boxes)

2

In [None]:
# Crop the table region
# Only the first detected table is processed by the cells below. Fail early with a
# clear message when nothing was detected, instead of leaving `crop` undefined.
if not table_boxes:
    raise ValueError("No table regions were detected on this page.")
idx, tb = 0, table_boxes[0]
x1, y1, x2, y2 = tb
crop = image[y1:y2, x1:x2]

In [None]:
# Visualize the cropped area
import matplotlib.pyplot as plt
# `crop` comes from cv2.imread (BGR channel order) while matplotlib expects RGB,
# so convert before display to avoid swapped colours.
plt.imshow(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()

#### Filter the page text by removing the table content from it

In [None]:
# Keep only words whose box is not fully inside any detected table, then regroup into lines.
# Note: this starts from the raw ocr_text_boxes, not the output of preprocess_words.
filtered_texts = [t for t in ocr_text_boxes if not inside_table(t["bbox"], table_boxes)]
filtered_lines = group_words_into_lines(filtered_texts)

In [None]:
# scale=40 maps every 40 pixels of indentation/gap to one space character.
print(format_lines_with_spacing(filtered_lines, scale=40))

BID SLAIE DEVELOPMENT


### 2. OCR the detected Tables

In [None]:
# Load OCR Model
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import cv2
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# Initialize the predictor (use 'db_resnet50' for detection, 'crnn_vgg16_bn' for recognition; or 'vit' for better accuracy)
# This is a second predictor, separate from `ocr_model` created for the full page.
model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
# For GPU: model = ocr_predictor(..., device='cuda:0')
print("DocTR model loaded.")

DocTR model loaded.


In [None]:
# `crop` was sliced from a cv2.imread image (BGR channel order); docTR loads images
# as RGB, so convert before running OCR on the array directly.
crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
result = model([crop_rgb])  # Input: list of np.array (H, W, 3)
json_result = result.export()

In [None]:
# Flatten the exported OCR result into word records with text, score and pixel-space corners.
table_words = extract_table_words_with_bboxes(json_result)

Detected 437 words in table.


In [None]:
# Same flattening as for the full page, applied to the table crop's OCR result:
# coordinates are therefore relative to the crop, not to the page.
table_ocr_text_boxes = []
for page in result.pages:
    h, w = page.dimensions
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                # Scale the two corner points by the page width/height and truncate to ints.
                (x_min, y_min), (x_max, y_max) = word.geometry
                x_min, x_max = int(x_min * w), int(x_max * w)
                y_min, y_max = int(y_min * h), int(y_max * h)
                table_ocr_text_boxes.append({
                    "text": word.value,
                    "bbox": (x_min, y_min, x_max, y_max)
                })

In [None]:
# Inspect the collected word boxes (displays the whole list)
table_ocr_text_boxes

### 3. Detect Layout Coordinates of Table

In [None]:
# Table structure recognition on the cropped table (SLANet via detect_table_layout);
# only the first prediction is kept.
table_layout_coords = detect_table_layout(crop)[0]

[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/SLANet`.[0m


In [None]:
# Build the table DataFrame from the detected cell layout and the table's OCR words.
df = paddleocr_doctr_to_df(table_layout_coords, table_ocr_text_boxes)
# Promote the first row to column headers.
df.columns = df.iloc[0]
# Drop the first row from the DataFrame
df = df.drop(0).reset_index(drop=True)

In [None]:
# Render the table as Markdown text without the index column.
tbl_markdown = df.to_markdown(index=False)

In [None]:
# Show the Markdown table
print(tbl_markdown)

In [None]:
# filtered_texts = [t for t in ocr_text_boxes if not inside_table(t["bbox"], table_boxes)]

In [None]:
# Rebuild the page stream: words outside tables are kept, and each consecutive run
# of in-table words is replaced by a single {"table": tbl_markdown} placeholder.
# NOTE(review): inside_table checks every box in table_boxes, but tbl_markdown was
# built from the first table only, so any other table region would also be rendered
# with the first table's Markdown — confirm this is intended.
test = []
switch = True  # True while the next in-table word should emit the placeholder
for t in ocr_text_boxes:
  if inside_table(t["bbox"], table_boxes) and switch:
    test.append({"table":tbl_markdown})
    switch = False
  elif inside_table(t["bbox"], table_boxes) and not switch:
    # Still inside the run already represented by the placeholder
    continue
  else:
    test.append(t)
    switch = True
test

In [None]:
# Print the Markdown of every table placeholder present in the merged stream.
for i in test:
  if "table" in i:
    print(i["table"])

In [None]:
# Group the word runs into lines while keeping table placeholders in sequence.
test_lines = group_words_and_tables_into_lines(test)

In [None]:
# Final rendering: spaced text lines with the Markdown table inserted in place.
print(render_text_and_tables_from_ocr(test_lines, scale=100))

In [None]:
# table_words = preprocess_words(table_ocr_text_boxes)
# table_lines = group_words_into_lines(table_words)

In [None]:
# # Merge text + tables in reading order
# merged = [{"type": "text", "text": t["text"], "bbox": t["bbox"]} for t in filtered_texts]
# merged += [{"type": "table", "text": tbl_markdown, "bbox": tbl["bbox"]} for tbl in table_contents]

# # Sort top-to-bottom, left-to-right
# merged.sort(key=lambda x: (x["bbox"][1], x["bbox"][0]))

In [None]:
# merged

In [None]:
# # Print final output
# final_output = []
# for item in merged:
#     if item["type"] == "text":
#         final_output.append(item["text"])
#     else:
#         final_output.append("\n" + item["text"] + "\n")

# print(" ".join(final_output))