In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from typing import Dict, List, Tuple
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class docTRParser:
    """OCR a document with docTR and write a Markdown report with HTML tables.

    Every recognised word is reduced to the centre of its bounding box.  Words
    are grouped into visual rows by clustering the vertical centres; each row
    is then classified as a *table* row (two or more horizontal clusters) or a
    *text* row (a single cluster).  Runs of consecutive rows of the same kind
    are rendered together: table runs as an HTML ``<table>``, text runs as
    plain lines.

    The tolerances below are expressed in the same units as the geometry docTR
    reports (coordinates relative to the page size, so ``0.01`` is about 1% of
    the page height/width).  They are class attributes so they can be tuned
    per instance, either through ``__init__`` or by plain assignment.

    NOTE(review): columns are detected from word *centres*, so two adjacent
    wide words whose centres are further apart than the tolerance count as
    separate columns even when the gap between them is a normal space.
    Confirm this matches the documents being processed before relying on the
    table/text split.
    """

    # Max vertical distance between neighbouring word centres of one row (~0.7%).
    row_eps = 0.007
    # Max horizontal distance between neighbouring word centres that still
    # counts as the same column when classifying a single row (~3%).
    row_col_eps = 0.03
    # Same idea, applied to all words of a table block at once so that the
    # columns line up across its rows (~5%).
    table_col_eps = 0.05

    def __init__(self, row_eps=None, row_col_eps=None, table_col_eps=None):
        """Load the docTR OCR predictor and optionally override tolerances.

        Args:
            row_eps: Override for ``row_eps``; ``None`` keeps the class default.
            row_col_eps: Override for ``row_col_eps``; ``None`` keeps the default.
            table_col_eps: Override for ``table_col_eps``; ``None`` keeps the default.
        """
        print("Loading docTR models...")
        # basic configuration
        self.model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
        if row_eps is not None:
            self.row_eps = row_eps
        if row_col_eps is not None:
            self.row_col_eps = row_col_eps
        if table_col_eps is not None:
            self.table_col_eps = table_col_eps
        print("Models loaded successfully!")

    def extract_mixed_content(self, file_path: str, output_md_file: str = "output.md"):
        """Extract a document's content and save it as a Markdown report.

        Tables are emitted as HTML, everything else as plain text lines.  Each
        page is introduced by a ``## Page N`` heading and followed by a
        ``---`` rule.

        Args:
            file_path: Document to read.  A name ending in ``.pdf`` (any case)
                is loaded as a PDF, anything else as a single image.
            output_md_file: Destination of the report (written as UTF-8,
                overwriting an existing file).
        """
        # 1. Load Document (str() lets callers pass a pathlib.Path as well)
        path_str = str(file_path)
        if path_str.lower().endswith('.pdf'):
            doc = DocumentFile.from_pdf(path_str)
        else:
            doc = DocumentFile.from_images([path_str])

        # 2. Run OCR
        print("Running OCR...")
        result = self.model(doc)

        # 3. Process & Generate Output
        full_markdown = []
        for page_number, page in enumerate(result.pages, start=1):
            print(f"Processing Page {page_number}...")
            full_markdown.append(f"## Page {page_number}\n")
            # Analyze page to split into "Table Blocks" and "Text Blocks"
            full_markdown.append(self._process_page_smartly(page))
            full_markdown.append("\n---\n")

        # 4. Save
        with open(output_md_file, "w", encoding="utf-8") as f:
            f.write("\n".join(full_markdown))
        print(f"✅ Saved smart report to {output_md_file}")

    @staticmethod
    def _cluster_1d(values, eps: float):
        """Return one cluster label per row of a single-column frame.

        DBSCAN is run with ``min_samples=1``, so every point is a core point
        (no ``-1`` noise label) and values end up in the same cluster whenever
        they are linked by a chain of neighbours at most ``eps`` apart.
        """
        return DBSCAN(eps=eps, min_samples=1).fit(values).labels_

    @staticmethod
    def _bounding_box(geometry) -> Tuple[float, float, float, float]:
        """Return ``(min_x, min_y, max_x, max_y)`` enclosing ``geometry``.

        Accepts the two-corner form ``((min_x, min_y), (max_x, max_y))`` as
        well as any longer sequence of ``(x, y)`` points, in which case the
        axis-aligned envelope is used.  (Presumably docTR emits four-point
        polygons when configured for rotated pages -- verify against the
        installed version.)
        """
        points = np.asarray(geometry, dtype=float).reshape(-1, 2)
        min_x, min_y = points.min(axis=0)
        max_x, max_y = points.max(axis=0)
        return float(min_x), float(min_y), float(max_x), float(max_y)

    def _process_page_smartly(self, page) -> str:
        """Render one OCR page, separating table rows from text rows.

        Args:
            page: Object exposing ``blocks -> lines -> words``, where each
                word has a ``value`` (text) and a ``geometry``.

        Returns:
            The rendered blocks joined by blank lines, or ``""`` when the page
            contains no words.
        """
        # A. Extract all words with coordinates
        words_data = []
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    min_x, min_y, max_x, max_y = self._bounding_box(word.geometry)
                    words_data.append({
                        'text': word.value,
                        'y': (min_y + max_y) / 2,
                        'x': (min_x + max_x) / 2,
                        'min_x': min_x,  # left edge: reading-order sort key
                        'row_h': max_y - min_y,
                    })

        if not words_data:
            return ""

        df = pd.DataFrame(words_data)

        # B. Cluster into ROWS (Y-axis): words whose vertical centres are
        # chained within `row_eps` belong to the same visual line.
        df['row_id'] = self._cluster_1d(df[['y']], self.row_eps)

        # Count the horizontal clusters ("columns") of every row.
        row_stats = []
        for rid, group in df.groupby('row_id'):
            col_labels = self._cluster_1d(group[['x']], self.row_col_eps)
            row_stats.append({
                'row_id': rid,
                'avg_y': group['y'].mean(),
                'num_cols': len(set(col_labels)),
                'words': group,
            })

        # Cluster labels carry no spatial meaning, so order rows top to bottom
        # by their mean vertical position.
        row_stats.sort(key=lambda row: row['avg_y'])

        # C. Group consecutive rows of the same kind into "Content Blocks".
        # HEURISTIC: a row with 2+ distinct columns is a TABLE row, otherwise TEXT.
        blocks = []
        for row in row_stats:
            row_type = 'table' if row['num_cols'] >= 2 else 'text'
            if blocks and blocks[-1]['type'] == row_type:
                blocks[-1]['rows'].append(row)
            else:
                blocks.append({'type': row_type, 'rows': [row]})

        # D. Render Blocks
        rendered = []
        for block in blocks:
            if block['type'] == 'table':
                rendered.append(self._rows_to_html(block['rows']))
            else:
                rendered.append(self._rows_to_text(block['rows']))

        return "\n\n".join(rendered)

    def _rows_to_html(self, rows_list) -> str:
        """Generate an HTML table from a list of row dicts.

        Args:
            rows_list: Row dicts in top-to-bottom order, each with a
                ``'row_id'`` and a ``'words'`` frame holding at least the
                columns ``text``, ``x``, ``min_x`` and ``row_id``.

        Returns:
            HTML for a header-less, index-less table with ``border="1"``;
            ``""`` when ``rows_list`` is empty.
        """
        if not rows_list:
            return ""

        table_words = pd.concat([row['words'] for row in rows_list], ignore_index=True)

        # Columns are re-clustered over the WHOLE block (not per row) so that
        # column 2 of the first row lines up with column 2 of the last row.
        table_words['col_id'] = self._cluster_1d(table_words[['x']], self.table_col_eps)

        # Rank the column clusters left-to-right by their mean x position.
        col_order = table_words.groupby('col_id')['x'].mean().sort_values().index
        col_rank = {col_id: rank for rank, col_id in enumerate(col_order)}
        table_words['sorted_col_id'] = table_words['col_id'].map(col_rank)

        # Rank the rows by their position in rows_list (already top-to-bottom).
        row_rank = {row['row_id']: rank for rank, row in enumerate(rows_list)}
        table_words['sorted_row_id'] = table_words['row_id'].map(row_rank)

        # Words arrive in OCR order, which is not guaranteed to be left-to-right
        # inside a cell; groupby keeps the frame order within each group, so
        # sorting here fixes the order of the words joined below.
        table_words = table_words.sort_values('min_x', kind='stable')

        # Pivot to a row x column grid of cell strings; absent cells become "".
        grid = (
            table_words.groupby(['sorted_row_id', 'sorted_col_id'])['text']
            .apply(lambda cell_words: " ".join(cell_words))
            .unstack()
            .fillna("")
        )

        # to_html formats cells under pandas' display options; without lifting
        # `display.max_colwidth` long cells are cut off with "...".
        with pd.option_context('display.max_colwidth', None):
            return grid.to_html(index=False, header=False, border=1)

    def _rows_to_text(self, rows_list) -> str:
        """Join text rows into lines, one line per row.

        Words of each row are ordered by their left edge and separated by a
        single space; rows are separated by newlines.
        """
        lines = []
        for row in rows_list:
            ordered = row['words'].sort_values('min_x', kind='stable')
            lines.append(" ".join(ordered['text']))
        return "\n".join(lines)

In [15]:
# --- EXECUTION ---
if __name__ == "__main__":
    # Replace both values with your actual input document and report location.
    source_document = "/Users/santusahoo/Documents/DAGENT/CRPL-1N60001074-CADPO110494.pdf"
    report_path = "ins_po_5.md"

    # Constructing the parser loads the OCR models; the call below runs OCR
    # on the document and writes the Markdown report.
    processor = docTRParser()
    processor.extract_mixed_content(source_document, output_md_file=report_path)

Loading docTR models...
Models loaded successfully!
Running OCR...
Processing Page 1...
Processing Page 2...
Processing Page 3...
Processing Page 4...
✅ Saved smart report to ins_po_5.md
