# Text extraction from pdf

- Headings kamen von VLM (Claude)
- Code wurde nur aus Google Colab kopiert, aber noch nicht überarbeitet

# OCR

In [None]:
!sudo apt-get install -y tesseract-ocr
!sudo apt-get install -y tesseract-ocr-eng
!sudo apt-get install -y tesseract-ocr-deu
!sudo apt-get install -y poppler-utils

In [None]:
from pdf2image import convert_from_path
from PIL import Image
import cv2
import numpy as np
import pytesseract
from typing import List, Dict
from tqdm import tqdm
class OCR:
    """
    Tesseract-based OCR pipeline for PDF manuals in English and German.

    Pages are rendered with pdf2image, thresholded and denoised with OpenCV,
    and then handed to Tesseract through pytesseract.
    """

    # Tesseract language codes this pipeline needs. Used both for the
    # installation check in __init__ and for the ``-l`` option in process_page,
    # so the two can never drift apart.
    REQUIRED_LANGS = ('eng', 'deu')

    def __init__(self):
        """
        Initialize OCR pipeline for processing manuals in English and German

        Raises:
            RuntimeError: If any language in REQUIRED_LANGS is not reported by
                the local Tesseract installation.
        """
        # Verify language support
        available_langs = pytesseract.get_languages()
        missing_langs = [
            lang for lang in self.REQUIRED_LANGS if lang not in available_langs
        ]

        if missing_langs:
            raise RuntimeError(
                f"Missing required language packs: {missing_langs}. "
                "Please install them using:\n"
                "Linux: sudo apt-get install tesseract-ocr-[lang]\n"
                "Windows: Download .traineddata files to tessdata directory"
            )

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess image optimized for manual/document text

        Args:
            image: Either a 3-dimensional array, which is converted with
                COLOR_BGR2GRAY, or a 2-dimensional array, which is used as is.

        Returns:
            The adaptively thresholded and denoised image.
        """
        # Convert to grayscale if needed
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # Apply adaptive thresholding - works better for text documents
        thresh = cv2.adaptiveThreshold(
            gray,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            11,  # block size (neighbourhood used to compute the threshold)
            2    # constant subtracted from the weighted mean
        )

        # Denoise
        return cv2.fastNlMeansDenoising(thresh)

    def process_page(self, pil_image: Image.Image, i: int, title: str) -> Dict:
        """
        Process a single page, optimized for manual text

        Args:
            pil_image: The rendered page.
            i: Zero-based page index; stored one-based as 'page_number'.
            title: Document title stored alongside the page.

        Returns:
            Dict with the keys 'image' (the unmodified input image), 'title',
            'text' (stripped OCR output) and 'page_number' (int, one-based).
        """
        # Convert PIL to CV2 for preprocessing. Normalising to RGB first keeps
        # the colour conversion valid for grayscale, palette or RGBA pages.
        cv_image = cv2.cvtColor(
            np.array(pil_image.convert('RGB')), cv2.COLOR_RGB2BGR
        )

        # Preprocess
        processed = self.preprocess_image(cv_image)

        # OCR Configuration for manual text: automatic page segmentation
        # (--psm 3) with all required languages, e.g. "-l eng+deu".
        custom_config = f"--psm 3 -l {'+'.join(self.REQUIRED_LANGS)}"

        # Get text
        text = pytesseract.image_to_string(
            processed,
            config=custom_config
        ).strip()

        return {
            'image': pil_image,
            'title': title,
            'text': text,
            'page_number': i + 1
        }

    def process_pdf(self, pdf_path: str, dpi: int = 200) -> List[Dict]:
        """
        Process PDF and return one result dictionary per page

        Args:
            pdf_path: Path to the PDF file.
            dpi: Resolution used to render the pages before OCR.

        Returns:
            List of dictionaries, in page order, with format:
            {
                'image': PIL.Image,
                'title': str,        # pdf_path without its file extension
                'text': str,
                'page_number': int   # one-based
            }

        Raises:
            RuntimeError: If rendering or OCR fails; the original exception is
                attached as the cause.
        """
        # Local import so this class does not depend on another cell's imports.
        import os

        try:
            # NOTE(review): 'pil' is not among the formats pdf2image documents
            # for `fmt`; kept unchanged - confirm it is still needed.
            pages = convert_from_path(
                pdf_path,
                dpi=dpi,
                fmt='pil'
            )

            # Strip the extension properly instead of cutting four characters.
            title = os.path.splitext(pdf_path)[0]

            # Process each page
            results = []
            for i, page in tqdm(
                enumerate(pages), desc="Processing pages", total=len(pages)
            ):
                results.append(self.process_page(page, i, title))

            return results

        except Exception as e:
            # Chain the cause so the original traceback stays visible.
            raise RuntimeError(
                f"Error processing PDF {pdf_path}: {str(e)}"
            ) from e


# Markdown

- OCR, Layout Analysis und Markdown Extraction mit Docling
- Kann auf CPU oder GPU laufen

In [None]:
!pip install -q docling
!sudo apt-get install poppler-utils -y
!pip install -q pdf2image
!pip install -q -U  PyPDF2

Man kann Docling auch mit Tesseract konfigurieren. Default ist EasyOCR.

In [None]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Obtain the model artifacts used by the PDF pipeline (judging by the method
# name, presumably downloaded from Hugging Face - confirm). The result is
# passed on as `artifacts_path` below.
model = StandardPdfPipeline.download_models_hf()
# Alternative: use an already available local artifacts directory instead.
# model = "/local/path/to/artifacts"

# PDF pipeline options pointing at the artifacts obtained above.
pipeline_options = PdfPipelineOptions(artifacts_path=model)
# Converter whose PDF input format is bound to these pipeline options;
# `converter` is the object used later to turn page PDFs into markdown.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

- Split each page of the PDF into its own single-page PDF
- Extract Markdown from each single-page PDF file

In [None]:
import os
import re
from PyPDF2 import PdfReader, PdfWriter

def natural_sort_key(s):
    """
    Create a key for natural sorting of strings containing numbers.

    The string is split on runs of ASCII digits. Digit runs become ints and
    everything else is lower-cased, so "page_10" sorts after "page_2".

    Args:
        s (str): Input string

    Returns:
        list: Alternating text (str) and number (int) components, always
            starting and ending with a text component.
    """
    # With a capturing group, re.split returns the separators too: text pieces
    # sit at even indices and the captured digit runs at odd indices.
    parts = re.split(r'([0-9]+)', s)

    # Decide by position rather than str.isdigit(): isdigit() is also true for
    # characters such as superscript digits, which int() rejects, and for
    # non-ASCII digits that would turn a text slot into an int and make keys
    # incomparable.
    return [int(part) if index % 2 else part.lower()
            for index, part in enumerate(parts)]

def split_pdf_with_paths(input_path, output_folder):
    """
    Write every page of a PDF to its own single-page PDF file.

    The output folder is created when missing. Files are named
    "<input name without extension>_page_<n>.pdf" with n starting at 1.
    Failures are reported with print() instead of being raised: if the input
    cannot be opened an empty list is returned, and a page that cannot be
    saved is left out of the result.

    Args:
        input_path (str): Path to the input PDF file
        output_folder (str): Directory where individual pages will be saved

    Returns:
        list: Naturally sorted paths of the files that were written
    """
    os.makedirs(output_folder, exist_ok=True)

    try:
        reader = PdfReader(input_path)
    except Exception as e:
        print(f"Error opening PDF file: {e}")
        return []

    page_count = len(reader.pages)
    stem, _ = os.path.splitext(os.path.basename(input_path))

    written = []
    for page_number in range(1, page_count + 1):
        # A fresh writer per page so each output holds exactly one page.
        writer = PdfWriter()
        writer.add_page(reader.pages[page_number - 1])

        target = os.path.join(output_folder, f"{stem}_page_{page_number}.pdf")

        try:
            with open(target, 'wb') as handle:
                writer.write(handle)
        except Exception as e:
            print(f"Error saving page {page_number}: {e}")
        else:
            # Only pages that were actually saved are reported back.
            written.append(target)

    written.sort(key=natural_sort_key)
    return written

def process_directory(directory):
    """
    Process all PDFs in a directory, split them, and return all generated paths.

    Only entries directly inside `directory` are considered. Every PDF is
    split into the shared sub-folder "<directory>/split_pdfs".

    Args:
        directory (str): Directory containing PDF files

    Returns:
        list: Naturally sorted list of all generated PDF paths
    """
    # All inputs share one output folder, so compute it once.
    output_folder = os.path.join(directory, 'split_pdfs')

    # Match the extension case-insensitively so files such as "MANUAL.PDF" are
    # not silently skipped. Sorting gives a reproducible processing order,
    # since os.listdir returns entries in arbitrary order.
    pdf_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith('.pdf')
    )

    all_generated_paths = []
    for pdf_file in pdf_files:
        input_path = os.path.join(directory, pdf_file)
        all_generated_paths.extend(
            split_pdf_with_paths(input_path, output_folder)
        )

    # Sort all paths naturally
    all_generated_paths.sort(key=natural_sort_key)
    return all_generated_paths

In [None]:
batch_size = 16

# One dict per distinct PDF title. `data` is defined outside this cell; each
# of its items is indexed with "title". dict.fromkeys de-duplicates while
# keeping first-seen order, so the processing order is reproducible.
pdfs = [
    {"title": title}
    for title in dict.fromkeys(item["title"] for item in data)
]

# Collected result: one dict per page with "title", "page_number" and "text".
all_pages = []

for pdf in pdfs:
    title = pdf["title"]
    print(title)
    # Folder for the single-page PDFs of this document (fixed typo: the path
    # previously read "/content/driv#e/...").
    path = f"/content/drive/MyDrive/Pages/{title}"
    source = f"/content/drive/MyDrive/MiR200/{title}.pdf"
    paths = split_pdf_with_paths(source, path)
    pages = []
    for p in tqdm(paths, total=len(paths)):
        markdown = converter.convert(p).document.export_to_markdown()
        # Clean up the exported markdown: drop isolated " l " tokens, remove
        # the image placeholder and remove triple newlines.
        # NOTE(review): replacing "\n\n\n" with "" also removes paragraph
        # breaks - confirm this is intended.
        markdown_trans = (
            markdown
            .replace(" l ", " ")
            .replace("<!-- image -->", "")
            .replace("\n\n\n", "")
        )
        pages.append({
            "title": title,
            # Page number taken from the "<name>_page_<n>.pdf" file name; it
            # stays a string here.
            "page_number": p.split("_")[-1].split(".")[0],
            "text": markdown_trans,
        })
    all_pages.extend(pages)