In [1]:
!pip install pymupdf pytesseract pillow sentence-transformers hdbscan umap-learn matplotlib




In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from sentence_transformers import SentenceTransformer
import hdbscan
import umap
import numpy as np
import matplotlib.pyplot as plt
import re

# --- Step 1: Extract text with OCR fallback ---
def extract_text_from_page(page, min_chars=20, ocr_zoom=300 / 72):
    """Return the text of a PDF page, falling back to OCR for scanned pages.

    The page's embedded text layer is used when it holds at least
    ``min_chars`` characters. Otherwise the page is rasterised and passed
    to Tesseract.

    Args:
        page: A PyMuPDF page (anything exposing ``get_text()`` and
            ``get_pixmap()``).
        min_chars: Minimum length of the stripped text layer for it to be
            trusted without OCR.
        ocr_zoom: Scale factor applied when rasterising for OCR. PDF user
            space is 72 units per inch, so the default renders at roughly
            300 DPI.

    Returns:
        The stripped page text. If OCR yields nothing, the (short) native
        text is returned instead of an empty string.
    """
    native_text = page.get_text().strip()
    if len(native_text) >= min_chars:
        return native_text

    # Render larger than the 72 DPI default: OCR accuracy generally
    # degrades badly on low-resolution rasters. alpha=False keeps the
    # sample buffer at three bytes per pixel, matching the "RGB" mode below.
    pix = page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom), alpha=False)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    ocr_text = pytesseract.image_to_string(img).strip()

    # Do not throw away what little native text there was if OCR came back empty.
    return ocr_text or native_text

# --- Step 2: Extract additional features from text ---
# Compiled once at import time; extract_features runs once per page.
_TITLE_RE = re.compile(r'^[A-Z][A-Za-z\s]{3,}')
_DATE_RE = re.compile(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b')
# Word boundaries stop "Regards" from matching inside e.g. "disregards".
_SIGNATURE_RE = re.compile(
    r'\b(?:Sincerely|Regards|Best regards|Thank you)\b', re.I
)


def extract_features(text):
    """Compute simple hand-crafted layout features for one page of text.

    Args:
        text: The page text (may be empty).

    Returns:
        A 1-D NumPy array ``[word_count, has_title, has_date, has_signature]``:

        * ``word_count`` - number of whitespace-separated tokens.
        * ``has_title`` - 1 if the first non-blank line starts with a
          capital letter followed by at least three letters/spaces.
        * ``has_date`` - 1 if a numeric date such as ``12/05/2021`` or
          ``1-2-21`` appears anywhere.
        * ``has_signature`` - 1 if a closing phrase (e.g. "Sincerely")
          appears anywhere, case-insensitively.
    """
    word_count = len(text.split())

    # Look at the first line that actually has content: OCR output often
    # begins with blank lines or indentation, which would otherwise hide
    # a title.
    first_line = next(
        (line.strip() for line in text.splitlines() if line.strip()), ""
    )

    has_title = 1 if _TITLE_RE.search(first_line) else 0
    has_date = 1 if _DATE_RE.search(text) else 0
    has_signature = 1 if _SIGNATURE_RE.search(text) else 0
    return np.array([word_count, has_title, has_date, has_signature])

# --- Main processing function ---
def _group_consecutive_pages(labels):
    """Split page indices into runs of adjacent pages sharing the same label."""
    from itertools import groupby

    return [
        [page_idx for page_idx, _ in run]
        for _, run in groupby(enumerate(labels), key=lambda pair: pair[1])
    ]


def _plot_clusters(features, labels):
    """Project page features to 2-D with UMAP and show a labelled scatter plot."""
    n_pages = len(labels)
    # UMAP needs n_neighbors < n_samples; keep the original 5 when possible.
    reducer = umap.UMAP(
        n_neighbors=min(5, n_pages - 1),
        min_dist=0.3,
        metric='euclidean',
        random_state=42,
    )
    embedding_2d = reducer.fit_transform(features)

    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(
        embedding_2d[:, 0], embedding_2d[:, 1], c=labels, cmap='tab20', s=60
    )
    plt.colorbar(scatter, label='Cluster ID')
    plt.title("Page Clusters Visualization (UMAP)")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.show()


def process_pdf(pdf_path, *, visualize=True):
    """Split a merged PDF into documents by clustering its pages.

    Each page is turned into a sentence embedding plus a few hand-crafted
    features, the pages are clustered with HDBSCAN, and runs of adjacent
    pages with the same cluster label are reported as one document.

    Args:
        pdf_path: Path of the PDF to analyse.
        visualize: When true (the default), show a UMAP scatter plot of the
            page clusters. The plot is skipped for very small PDFs.

    Returns:
        A tuple ``(docs, clusters, page_texts)``:

        * ``docs`` - list of lists of 0-based page indices, one per
          detected document, in page order.
        * ``clusters`` - NumPy array with one cluster label per page.
          HDBSCAN uses ``-1`` for noise; adjacent noise pages are grouped
          like any other label.
        * ``page_texts`` - list with the extracted text of every page.

        For a PDF with no pages this is ``([], <empty array>, [])``.
    """
    min_cluster_size = 2
    # Below this many pages the 2-D projection is not informative, and
    # UMAP's initialisation is unreliable on only a handful of points.
    min_pages_for_plot = 4

    page_texts = []
    pattern_features = []

    print("Extracting text and features from pages...")
    # The context manager closes the file handle even if extraction fails.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            text = extract_text_from_page(page)
            page_texts.append(text)
            pattern_features.append(extract_features(text))

    n_pages = len(page_texts)
    if n_pages == 0:
        print("\nDetected documents and their page ranges:")
        return [], np.array([], dtype=int), page_texts

    # Load the embedding model only once we know there is something to embed.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Computing text embeddings...")
    embeddings = model.encode(page_texts)

    # Z-score the pattern features (epsilon avoids division by zero for
    # constant columns), then halve them so they do not dominate the
    # embedding dimensions.
    pattern_features = np.array(pattern_features)
    pattern_features = (pattern_features - pattern_features.mean(axis=0)) / (
        pattern_features.std(axis=0) + 1e-6
    )
    pattern_features_scaled = pattern_features * 0.5

    combined_features = np.hstack([embeddings, pattern_features_scaled])

    print("Clustering pages...")
    if n_pages < min_cluster_size:
        # Too few pages to form any cluster: label them all as noise.
        clusters = np.full(n_pages, -1, dtype=int)
    else:
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size, metric='euclidean'
        )
        clusters = clusterer.fit_predict(combined_features)

    docs = _group_consecutive_pages(clusters)

    print("\nDetected documents and their page ranges:")
    for i, doc_pages in enumerate(docs):
        print(f"Document {i+1}: pages {doc_pages}")

    if visualize and n_pages >= min_pages_for_plot:
        print("\nVisualizing clusters with UMAP...")
        _plot_clusters(combined_features, clusters)

    return docs, clusters, page_texts

# --- Run on your PDF file ---
if __name__ == "__main__":
    # Location of the merged PDF to split into documents.
    pdf_path = r"C:\Users\PawanMagapalli\Downloads\document\Doc-Classification\merged doc.pdf"  # Replace with your PDF path

    # process_pdf returns the page groups, the per-page cluster labels and
    # the per-page text, in that order.
    (detected_docs, cluster_labels, texts) = process_pdf(pdf_path=pdf_path)


Extracting text and features from pages...


PatternError: incomplete escape \U at position 2