In [None]:
from pathlib import Path

import cv2
import numpy
import pdf2image
import torch
from doclayout_yolo import YOLOv10
from huggingface_hub import hf_hub_download

# Download the DocLayout-YOLO DocStructBench checkpoint from the Hugging Face
# Hub and build the detector exactly once. `model` is the module-level
# detector that process_pdf_directory() uses for its predictions.
filepath = hf_hub_download(
    repo_id="juliozhao/DocLayout-YOLO-DocStructBench",
    filename="doclayout_yolo_docstructbench_imgsz1024.pt",
)
model = YOLOv10(filepath)

def convert_pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` with ``pdf2image.convert_from_path``.

    Returns whatever ``convert_from_path`` returns; callers in this file
    iterate it page by page.
    """
    page_images = pdf2image.convert_from_path(pdf_path)
    return page_images

def process_pdf_directory(pdf_dir, output_dir):
    """Run layout detection on every page of every PDF in a directory.

    For each ``*.pdf`` found directly inside ``pdf_dir``, a subdirectory named
    after the PDF's stem is created under ``output_dir``. Every page is saved
    there as ``page_<n>.jpg`` (1-based) next to ``page_<n>_annotated.jpg``,
    the image returned by the detection result's ``plot()``.

    Args:
        pdf_dir: Directory containing the PDFs (string or path-like).
        output_dir: Directory receiving the results. It is created, together
            with any missing parent directories, when it does not exist.

    Uses the module-level ``model`` for prediction and
    ``convert_pdf_to_images`` for rasterisation.
    """
    pdf_dir = Path(pdf_dir)
    output_dir = Path(output_dir)
    # parents=True: a nested output path must not fail just because an
    # intermediate directory is missing.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Loop-invariant, so choose the inference device once.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # sorted() gives a reproducible processing order; glob order is arbitrary.
    for pdf_file in sorted(pdf_dir.glob("*.pdf")):
        # One output subdirectory per PDF.
        pdf_output_dir = output_dir / pdf_file.stem
        pdf_output_dir.mkdir(exist_ok=True)

        pages = convert_pdf_to_images(pdf_file)
        for page_number, image in enumerate(pages, start=1):
            # The rendered page is a PIL image (RGB); OpenCV writes BGR.
            opencv_image = cv2.cvtColor(numpy.array(image), cv2.COLOR_RGB2BGR)

            # The detector is fed the saved JPEG, which is also kept as output.
            image_path = pdf_output_dir / f"page_{page_number}.jpg"
            cv2.imwrite(str(image_path), opencv_image)

            det_res = model.predict(
                str(image_path),
                imgsz=1024,
                conf=0.2,
                device=device,
            )

            # Save the page with the detected layout regions drawn on it.
            annotated_frame = det_res[0].plot(pil=True, line_width=5, font_size=20)
            result_path = pdf_output_dir / f"page_{page_number}_annotated.jpg"
            cv2.imwrite(str(result_path), annotated_frame)

# Usage: run layout detection over every PDF in the input folder.
pdf_directory, output_directory = "data", "output"  # PDF source folder, results folder
process_pdf_directory(pdf_directory, output_directory)

In [None]:
from markitdown import MarkItDown
from doclayout_yolo import YOLOv10
from huggingface_hub import hf_hub_download
import json
from pathlib import Path
import fitz
import numpy as np
import cv2
import pdf2image
import numpy
import torch
import tempfile



def convert_pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` with ``pdf2image.convert_from_path``.

    Returns whatever ``convert_from_path`` returns; callers in this file
    iterate it page by page.
    """
    page_images = pdf2image.convert_from_path(pdf_path)
    return page_images

def select_boxes(det_res, image,
                 classes=("figure", "formula", "isolate_formula", "table")):
    """Yield crops of ``image`` for detected boxes of the selected classes.

    Args:
        det_res: Detection results. ``det_res[0].boxes`` is iterated and
            ``det_res[0].names`` maps a class index to its class name.
        image: Array the boxes refer to, indexed ``[row, column]``
            (``image.shape[0]`` is the height, ``image.shape[1]`` the width).
        classes: Class names to keep. ``"formula"`` is kept for backward
            compatibility. NOTE(review): ``"isolate_formula"`` is the label
            used by the older extraction code in this file and appears to be
            what the model emits for formulas - confirm against
            ``det_res[0].names``.

    Yields:
        ``(cropped, cls_name, conf, box)``: the cropped view of ``image``, the
        class name, the confidence as a ``float`` and the original box
        object. Boxes whose clamped crop would be empty are skipped.
    """
    import math  # local import: only needed by this helper

    result = det_res[0]
    height, width = image.shape[:2]
    wanted = frozenset(classes)

    for box in result.boxes:
        cls_name = result.names[int(box.cls[0])]
        if cls_name not in wanted:
            continue

        # xywh is treated as (center x, center y, width, height). Compute the
        # corners in floating point and round outwards, so the crop never
        # loses pixels to integer truncation.
        cx, cy, w, h = box.xywh[0].tolist()
        x1 = max(0, math.floor(cx - w / 2))
        y1 = max(0, math.floor(cy - h / 2))
        x2 = min(width, math.ceil(cx + w / 2))
        y2 = min(height, math.ceil(cy + h / 2))

        # A box lying entirely outside the image clamps to an empty slice,
        # which cannot be written as an image later on.
        if x2 <= x1 or y2 <= y1:
            continue

        yield image[y1:y2, x1:x2], cls_name, float(box.conf[0]), box


def _extract_page_markdown(md, pdf_document, page_index, temp_path):
    """Return MarkItDown's text for one page, or "" when extraction fails."""
    temp_pdf_path = temp_path / f"page_{page_index + 1}.pdf"

    # MarkItDown converts whole files, so the page is first copied into its
    # own single-page PDF. The scratch document is closed even if saving fails.
    new_pdf = fitz.open()
    try:
        new_pdf.insert_pdf(pdf_document, from_page=page_index, to_page=page_index)
        new_pdf.save(temp_pdf_path)
    finally:
        new_pdf.close()

    # Best effort: a page whose text cannot be extracted yields empty text
    # instead of aborting the whole run.
    try:
        return md.convert(str(temp_pdf_path)).text_content or ""
    except Exception as e:
        print(f"Error extracting text from page {page_index + 1}: {str(e)}")
        return ""


def _save_regions(det_res, opencv_image, pdf_output_dir, page_number):
    """Write one crop per selected detection and return the region records."""
    regions = []
    for cropped, cls_name, conf, box in select_boxes(det_res, opencv_image):
        # An empty crop cannot be encoded as an image; skip it.
        if cropped.size == 0:
            continue
        region_path = pdf_output_dir / f"page_{page_number}_{cls_name}_{len(regions)}.jpg"
        cv2.imwrite(str(region_path), cropped)
        regions.append({
            "type": cls_name,
            "confidence": conf,
            "bbox": box.xywh[0].tolist(),
            "region_image_path": str(region_path),
        })
    return regions


def process_pdf_directory(pdf_dir, output_dir, save_annotated=False):
    """Extract per-page text, layout regions and metadata from PDFs.

    For each ``*.pdf`` found directly inside ``pdf_dir``, a subdirectory named
    after the PDF's stem is created under ``output_dir``. For every page
    (numbered from 1) the following files are written there:

    * ``page_<n>.jpg`` - the rendered page, which is also the detector input.
    * ``page_<n>.md`` - the page text extracted with MarkItDown (empty when
      extraction fails).
    * ``page_<n>_<class>_<k>.jpg`` - one crop per region yielded by
      ``select_boxes``.
    * ``page_<n>.json`` - ``{"regions": [...], "markdown_path": ...}`` where
      each region holds ``type``, ``confidence``, ``bbox`` (the box's
      ``xywh`` values) and ``region_image_path``.
    * ``page_<n>_annotated.jpg`` - only when ``save_annotated`` is true.

    Args:
        pdf_dir: Directory containing the PDFs (string or path-like).
        output_dir: Directory receiving the results. It is created, together
            with any missing parent directories, when it does not exist.
        save_annotated: When true, also save the detection result's plotted
            image for every page. Defaults to ``False``, in which case no
            plot is computed.

    Uses the module-level ``model`` for prediction.
    """
    pdf_dir = Path(pdf_dir)
    print(pdf_dir)
    output_dir = Path(output_dir)
    # parents=True: nested output paths must not fail on a missing parent.
    output_dir.mkdir(parents=True, exist_ok=True)

    md = MarkItDown()
    # Loop-invariant, so choose the inference device once.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # sorted() gives a reproducible processing order; glob order is arbitrary.
    for pdf_file in sorted(pdf_dir.glob("*.pdf")):
        print(pdf_file)
        pdf_output_dir = output_dir / pdf_file.stem
        pdf_output_dir.mkdir(exist_ok=True)

        # PyMuPDF handle, used to split the document into single pages.
        pdf_document = fitz.open(pdf_file)
        try:
            # Single-page PDFs are scratch files; keep them out of the output.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                # Rendered pages for the detector.
                images = convert_pdf_to_images(pdf_file)

                # zip() stops at the shorter sequence, so a page-count
                # mismatch between the two libraries cannot index out of range.
                for i, (image, _pdf_page) in enumerate(zip(images, pdf_document)):
                    page_number = i + 1
                    page_text = _extract_page_markdown(md, pdf_document, i, temp_path)

                    # The rendered page is a PIL image (RGB); OpenCV writes BGR.
                    opencv_image = cv2.cvtColor(numpy.array(image), cv2.COLOR_RGB2BGR)
                    image_path = pdf_output_dir / f"page_{page_number}.jpg"
                    cv2.imwrite(str(image_path), opencv_image)

                    det_res = model.predict(
                        str(image_path),
                        imgsz=1024,
                        conf=0.2,
                        device=device,
                    )

                    page_data = {
                        "regions": [],
                        "markdown_path": str(pdf_output_dir / f"page_{page_number}.md"),
                    }

                    with open(page_data["markdown_path"], "w", encoding="utf-8") as f:
                        f.write(page_text.strip() if page_text else "")

                    page_data["regions"] = _save_regions(
                        det_res, opencv_image, pdf_output_dir, page_number
                    )

                    json_path = pdf_output_dir / f"page_{page_number}.json"
                    with open(json_path, "w", encoding="utf-8") as f:
                        json.dump(page_data, f, ensure_ascii=False, indent=2)

                    if save_annotated:
                        annotated_frame = det_res[0].plot(pil=True, line_width=5, font_size=20)
                        result_path = pdf_output_dir / f"page_{page_number}_annotated.jpg"
                        cv2.imwrite(str(result_path), annotated_frame)
        finally:
            # Release the PDF handle even when a page fails part-way through.
            pdf_document.close()

# Fetch the DocStructBench checkpoint from the Hugging Face Hub and build the
# detector that process_pdf_directory() uses.
filepath = hf_hub_download(repo_id="juliozhao/DocLayout-YOLO-DocStructBench", filename="doclayout_yolo_docstructbench_imgsz1024.pt")
model = YOLOv10(filepath)

# Usage: process every PDF in the input folder and write results to the output folder.
pdf_directory, output_directory = "../data/pdfs/", "../data/output_pdfs/output_v5"
process_pdf_directory(pdf_directory, output_directory)
