In [None]:
# Install required packages
!pip install pymupdf easyocr opencv-python numpy scikit-learn sentence-transformers



In [None]:
# Install required packages
!pip install pymupdf easyocr

import fitz
import easyocr
import tempfile
import os
import re
from collections import defaultdict

# Module-level EasyOCR reader shared by the functions below (language list
# ['en'], gpu=True).
reader = easyocr.Reader(['en'], gpu=True)

def extract_text_focused(pdf_path):
    """Extract native and OCR text for every page of a PDF.

    Each page is rendered at 3x zoom, written to a temporary PNG and passed to
    the module-level EasyOCR ``reader``. OCR fragments whose first
    bounding-box point lies in the top 20% of the rendered image are also
    collected separately as the page's "top" text.

    Args:
        pdf_path: Path of the PDF handed to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order, with the keys
        ``'page_num'`` (0-based index), ``'top_text'`` (OCR text from the top
        fifth of the page) and ``'full_text'`` (the native text when it is
        longer than 50 characters, otherwise the OCR text).
    """
    page_data = []
    doc = fitz.open(pdf_path)

    try:
        print(f"🔍 Analyzing {len(doc)} pages - step by step approach...")

        for page_num in range(len(doc)):
            page = doc[page_num]
            native_text = page.get_text()
            pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))

            # mkstemp creates the file atomically (mktemp is deprecated and
            # race-prone); the finally clause removes it even if OCR fails.
            fd, img_path = tempfile.mkstemp(suffix='.png')
            os.close(fd)
            try:
                pix.save(img_path)
                ocr_result = reader.readtext(img_path, detail=True)
            finally:
                os.remove(img_path)

            top_threshold = pix.height * 0.2

            all_fragments = []
            top_fragments = []
            for bbox, text, _conf in ocr_result:
                all_fragments.append(text)
                # bbox[0] is the first corner point; its y decides "top of page"
                if bbox[0][1] <= top_threshold:
                    top_fragments.append(text)

            ocr_text = " ".join(all_fragments)
            best_text = native_text if len(native_text) > 50 else ocr_text

            page_data.append({
                'page_num': page_num,
                'top_text': " ".join(top_fragments).strip(),
                'full_text': best_text.strip(),
            })

            print(f"📄 Page {page_num + 1}")
    finally:
        # Close the document even when rendering or OCR raised
        doc.close()

    return page_data

def step1_group_by_type(page_data):
    """STEP 1: Bucket pages into coarse categories based on their text.

    A page goes to the first category that applies: ``'schedules'`` when its
    upper-cased full text contains "SCHEDULE", ``'articles'`` when it contains
    "ARTICLE", ``'numbered_pages'`` when a ``-N-`` style number occurs in its
    top text or full text, and ``'other'`` otherwise.

    Args:
        page_data: Iterable of page dicts with ``'page_num'``, ``'top_text'``
            and ``'full_text'`` keys.

    Returns:
        Dict mapping each category name to the list of page dicts in it,
        in input order.
    """
    print("\n📋 STEP 1: Grouping pages by type...")

    dash_number = r'-\s*\d+\s*-'

    def classify(entry):
        """Return the category name for one page dict."""
        upper_text = entry['full_text'].upper()
        if 'SCHEDULE' in upper_text:
            return 'schedules'
        if 'ARTICLE' in upper_text:
            return 'articles'
        if re.search(dash_number, entry['top_text']) or re.search(dash_number, entry['full_text']):
            return 'numbered_pages'
        return 'other'

    groups = {name: [] for name in ('schedules', 'articles', 'numbered_pages', 'other')}
    for entry in page_data:
        groups[classify(entry)].append(entry)

    # Report the size of every bucket with a preview of its first three pages
    for label, members in groups.items():
        print(f"   {label}: {len(members)} pages")
        for member in members[:3]:
            snippet = member['full_text'][:50].replace('\n', ' ')
            print(f"      Page {member['page_num']+1}: {snippet}...")

    return groups

def step2_sort_within_groups(groups):
    """STEP 2: Sort the pages inside each group by a number found in their text.

    Numbered pages are ordered by their ``-N-`` dash number, articles by the
    number after "ARTICLE", schedules by the number after "SCHEDULE" (falling
    back to a dash number). Numbers may be decimal or Roman numerals. Pages
    without a usable number get the sort key 999 and therefore sort last.

    The page dicts are annotated in place with a ``'sort_key'`` entry and the
    group lists are sorted in place; the ``'other'`` group is passed through
    unchanged.

    Args:
        groups: Dict with the keys ``'numbered_pages'``, ``'articles'``,
            ``'schedules'`` and ``'other'``, each a list of page dicts with
            ``'top_text'`` and ``'full_text'``.

    Returns:
        Dict with the same four keys holding the sorted lists.
    """
    print("\n🔢 STEP 2: Sorting within each group...")

    unknown = 999  # sort key for pages without a usable number
    roman_digits = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    # Accept only well-formed numerals so stray letter runs are not read as numbers
    valid_roman = re.compile(
        r'M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'
    )

    def parse_number(token):
        """Return the value of a decimal or Roman-numeral token, or None."""
        if token.isdigit():
            return int(token)
        numeral = token.upper()
        if not numeral or not valid_roman.fullmatch(numeral):
            return None
        total = 0
        previous = 0
        # Right-to-left: a symbol smaller than its right neighbour is subtracted
        for symbol in reversed(numeral):
            current = roman_digits[symbol]
            if current < previous:
                total -= current
            else:
                total += current
            previous = current
        return total

    def extract_number(text, patterns):
        """Return the first parseable number captured by any pattern, else 999."""
        for pattern in patterns:
            for token in re.findall(pattern, text, re.IGNORECASE):
                value = parse_number(token)
                if value is not None:
                    return value
        return unknown

    # The (?![A-Z]) lookahead keeps the keyword from matching inside a longer
    # word ("SCHEDULED"); the trailing \b keeps the numeral from being the
    # prefix of a word ("ARTICLE DEFINITIONS").
    dash_patterns = [r'-\s*(\d+)\s*-']
    article_patterns = [r'ARTICLE(?![A-Z])[:\-\s]*([IVXLCDM]+|\d+)\b']
    schedule_patterns = [r'SCHEDULE(?![A-Z])[:\s]*([IVXLCDM]+|\d+)\b', r'-\s*(\d+)\s*-']

    sorted_groups = {}

    # Sort numbered pages by dash numbers
    numbered = groups['numbered_pages']
    for page in numbered:
        page['sort_key'] = extract_number(page['top_text'] + ' ' + page['full_text'], dash_patterns)
    numbered.sort(key=lambda x: x['sort_key'])
    sorted_groups['numbered_pages'] = numbered
    print(f"   ✓ Numbered pages sorted by dash numbers")

    # Sort articles by article numbers
    articles = groups['articles']
    for page in articles:
        page['sort_key'] = extract_number(page['full_text'], article_patterns)
    articles.sort(key=lambda x: x['sort_key'])
    sorted_groups['articles'] = articles
    print(f"   ✓ Articles sorted by article numbers")

    # Sort schedules by schedule number, falling back to a dash number
    schedules = groups['schedules']
    for page in schedules:
        page['sort_key'] = extract_number(page['top_text'] + ' ' + page['full_text'], schedule_patterns)
    schedules.sort(key=lambda x: x['sort_key'])
    sorted_groups['schedules'] = schedules
    print(f"   ✓ Schedules sorted (keeping what works!)")

    # Other pages - just keep original order
    sorted_groups['other'] = groups['other']
    print(f"   ✓ Other pages kept in original order")

    return sorted_groups

def step3_arrange_groups(sorted_groups):
    """STEP 3: Flatten the sorted groups into one page order.

    Groups are emitted in the fixed sequence numbered pages, articles,
    schedules, other; groups that are missing or empty are skipped.

    Args:
        sorted_groups: Dict mapping group names to lists of page dicts that
            carry ``'page_num'`` and ``'full_text'`` (and optionally
            ``'sort_key'``).

    Returns:
        List of 0-based original page indices in their new order.
    """
    print("\n🎯 STEP 3: Arranging groups in document order...")

    final_order = []

    # Typical document flow: numbered pages → articles → schedules → other
    for group_name in ('numbered_pages', 'articles', 'schedules', 'other'):
        members = sorted_groups.get(group_name, [])
        if not members:
            continue

        print(f"   Adding {len(members)} {group_name}")
        for entry in members:
            original_index = entry['page_num']
            final_order.append(original_index)
            key_shown = entry.get('sort_key', '?')
            snippet = entry['full_text'][:40].replace('\n', ' ')
            print(f"      Page {original_index+1} (sort: {key_shown}): {snippet}...")

    return final_order

def step_by_step_reorder(input_pdf_path, output_pdf_path):
    """Reorder a jumbled PDF by grouping, sorting and re-assembling its pages.

    Runs text extraction, then steps 1-3 (group by type, sort within groups,
    arrange groups), and finally writes a new PDF whose pages follow the
    computed order.

    Args:
        input_pdf_path: Path of the PDF to analyse and read pages from.
        output_pdf_path: Path the reordered PDF is saved to.

    Returns:
        Tuple ``(final_order, sorted_groups)``: the list of 0-based original
        page indices in their new order, and the per-group sorted page dicts.
    """
    print("🚀 Starting STEP-BY-STEP PDF reordering...")

    # Extract text
    page_data = extract_text_focused(input_pdf_path)

    # Step 1: Group by type
    groups = step1_group_by_type(page_data)

    # Step 2: Sort within groups
    sorted_groups = step2_sort_within_groups(groups)

    # Step 3: Arrange groups
    final_order = step3_arrange_groups(sorted_groups)

    # Create reordered PDF; both documents are closed even if copying or
    # saving fails.
    doc = fitz.open(input_pdf_path)
    try:
        reordered_doc = fitz.open()
        try:
            print("\n📄 Creating final PDF...")
            for new_pos, orig_idx in enumerate(final_order):
                reordered_doc.insert_pdf(doc, from_page=orig_idx, to_page=orig_idx)
                print(f"   Position {new_pos + 1} ← Original Page {orig_idx + 1}")

            reordered_doc.save(output_pdf_path)
        finally:
            reordered_doc.close()
    finally:
        doc.close()

    print(f"\n🎉 Step-by-step reordering complete: {output_pdf_path}")

    return final_order, sorted_groups

# USAGE - Step by step approach: reorder the input PDF and print a summary
input_pdf = '/content/jumbled.pdf'
output_pdf = '/content/reordered_step_by_step.pdf'

try:
    final_order, groups = step_by_step_reorder(input_pdf, output_pdf)

    # Banner
    print("\n" + "="*50 + "\n" + "🧩 STEP-BY-STEP SUMMARY" + "\n" + "="*50)

    # Result location and the 1-based page order
    print(f"📁 Result: {output_pdf}" + "\n" + f"🔄 Final order: {[x+1 for x in final_order]}")

    # Recap of the three steps
    print("\n".join([
        "\n🎯 What we did:",
        "   1. Grouped pages by type (schedules, articles, etc.)",
        "   2. Sorted within each group by their numbers",
        "   3. Arranged groups in logical document order",
    ]))

    print("\n📊 Group breakdown:")
    for group_name, pages in groups.items():
        print(f"   {group_name}: {len(pages)} pages")

except Exception as e:
    # Top-level boundary of the notebook cell: report the failure with a traceback
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()


🚀 Starting STEP-BY-STEP PDF reordering...
🔍 Analyzing 25 pages - step by step approach...
📄 Page 1
📄 Page 2
📄 Page 3
📄 Page 4
📄 Page 5
📄 Page 6
📄 Page 7
📄 Page 8
📄 Page 9
📄 Page 10
📄 Page 11
📄 Page 12
📄 Page 13
📄 Page 14
📄 Page 15
📄 Page 16
📄 Page 17
📄 Page 18
📄 Page 19
📄 Page 20
📄 Page 21
📄 Page 22
📄 Page 23
📄 Page 24
📄 Page 25

📋 STEP 1: Grouping pages by type...
   schedules: 12 pages
      Page 4: LOANAGREEMENT No. Dated (Project No: Borrower Ms. ...
      Page 8: 22- ARTICLE : ! DEFINTIONS GENERAL CONDITIONS 1.1 ...
      Page 9: 5- In the event of the Borrower, failing to pay th...
   articles: 2 pages
      Page 1: 7- ARTICLE - IV APPOINTMENT QE NOMNEE DIRECTORS Th...
      Page 6: -9 ARTICLE VI EFFECTIVE DATE QE AGREEMENTL PLACE Q...
   numbered_pages: 6 pages
      Page 7: -16- viii) The Borrower  shall at its own cost kee...
      Page 15: -21 - D) CONDITIONS APPLICABLE TO LOANS_DISBURSED ...
      Page 17: -19- xxxiv) The Borrower agrees and undertakes tha...
   other: 5 pag

In [None]:
# Enhanced detector for tricky page numbers

import fitz
import easyocr
import tempfile
import os
import re
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter

# Initialize the module-level EasyOCR reader (language list ['en'], gpu=True);
# the detection functions below use it as a global.
reader = easyocr.Reader(['en'], gpu=True)

def _make_temp_png():
    """Create an empty temporary ``.png`` file and return its path."""
    fd, path = tempfile.mkstemp(suffix='.png')
    os.close(fd)
    return path


def _discard_file(path):
    """Delete ``path`` if one was created; a failed cleanup is not fatal."""
    if path is None:
        return
    try:
        os.remove(path)
    except OSError:
        pass


def _ocr_confident_text(image_path, min_conf, **readtext_options):
    """OCR ``image_path`` and join the fragments whose confidence exceeds ``min_conf``."""
    results = reader.readtext(image_path, detail=True, **readtext_options)
    return ' '.join(text for _, text, conf in results if conf > min_conf)


def enhanced_page_detection(pdf_path):
    """Detect printed page numbers on every page using several strategies.

    For each page the native PDF text, a standard OCR pass (3x render), an
    enhanced OCR pass (6x render, contrast boost, grayscale, sharpening), an
    OCR pass restricted to the top 25% of the page, and a loose number search
    over all collected text are combined. Candidates are de-duplicated per
    page-number value (keeping the highest confidence) and sorted by
    confidence, best first.

    Each OCR strategy is best-effort: a failure is reported and that strategy
    contributes no text and no candidates for the page.

    Args:
        pdf_path: Path of the PDF handed to ``fitz.open``.

    Returns:
        A list with one dict per page holding ``'pdf_position'`` (0-based),
        ``'found_pages'`` (candidate dicts, best first), ``'best_page'`` (the
        first candidate or ``None``) and ``'text_preview'``.
    """
    doc = fitz.open(pdf_path)
    all_pages_detailed = []

    try:
        print(f"🔍 ENHANCED DETECTION on {len(doc)} pages...")
        print("=" * 80)

        for page_num in range(len(doc)):
            page = doc[page_num]

            print(f"\n📄 ANALYZING PDF Position {page_num + 1}:")
            print("-" * 50)

            # Reset per page: a failed strategy must contribute empty text,
            # never the text left over from the previous page.
            ocr_text = ""
            enhanced_text = ""
            header_text = ""

            # Strategy 1: Native PDF text extraction
            native_text = page.get_text()
            native_pages = find_page_numbers_comprehensive(native_text, "NATIVE")

            # Strategy 2: Standard OCR
            standard_pages = []
            img_path = None
            try:
                img_path = _make_temp_png()
                page.get_pixmap(matrix=fitz.Matrix(3, 3)).save(img_path)
                ocr_text = _ocr_confident_text(img_path, 0.3)
                standard_pages = find_page_numbers_comprehensive(ocr_text, "STANDARD_OCR")
            except Exception as e:
                print(f"   ⚠️ Standard OCR failed: {e}")
            finally:
                _discard_file(img_path)

            # Strategy 3: Enhanced OCR (higher resolution + preprocessing)
            enhanced_pages = []
            img_path = None
            enhanced_path = None
            try:
                img_path = _make_temp_png()
                page.get_pixmap(matrix=fitz.Matrix(6, 6)).save(img_path)  # 6x instead of 3x

                # Contrast boost, grayscale and sharpening before OCR; the
                # context manager releases the source file handle.
                with Image.open(img_path) as source:
                    img = ImageEnhance.Contrast(source).enhance(2.0)
                    img = img.convert('L')
                    img = img.filter(ImageFilter.SHARPEN)

                enhanced_path = _make_temp_png()
                img.save(enhanced_path)

                enhanced_text = _ocr_confident_text(
                    enhanced_path, 0.2, width_ths=0.3, height_ths=0.3
                )
                enhanced_pages = find_page_numbers_comprehensive(enhanced_text, "ENHANCED_OCR")
            except Exception as e:
                print(f"   ⚠️ Enhanced OCR failed: {e}")
            finally:
                _discard_file(img_path)
                _discard_file(enhanced_path)

            # Strategy 4: Region-specific OCR (focus on the top 25% of the page)
            region_pages = []
            header_path = None
            try:
                pix = page.get_pixmap(matrix=fitz.Matrix(4, 4))
                # pix.n is the number of components per pixel, so the reshape
                # does not depend on the pixmap being plain RGB.
                img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                    pix.height, pix.width, pix.n
                )

                header_height = int(pix.height * 0.25)
                header_img = Image.fromarray(img_array[:header_height, :, :])

                header_path = _make_temp_png()
                header_img.save(header_path)

                header_text = _ocr_confident_text(header_path, 0.2)
                region_pages = find_page_numbers_comprehensive(header_text, "HEADER_REGION")
            except Exception as e:
                print(f"   ⚠️ Region OCR failed: {e}")
            finally:
                _discard_file(header_path)

            # Strategy 5: Loose number search over everything collected above
            all_raw_text = native_text + " " + ocr_text + " " + enhanced_text + " " + header_text
            raw_pages = find_simple_dash_patterns(all_raw_text)

            # Combine all findings
            all_found_pages = native_pages + standard_pages + enhanced_pages + region_pages + raw_pages

            # Keep the highest-confidence candidate per page-number value
            unique_pages = {}
            for candidate in all_found_pages:
                value = candidate['value']
                if value not in unique_pages or candidate['confidence'] > unique_pages[value]['confidence']:
                    unique_pages[value] = candidate

            final_pages = list(unique_pages.values())
            final_pages.sort(key=lambda x: x['confidence'], reverse=True)

            # Show results
            if final_pages:
                print(f"   ✅ FOUND PAGE NUMBERS: {[p['value'] for p in final_pages]}")
                for p in final_pages:
                    print(f"      -{p['value']}- (Confidence: {p['confidence']:.1f}, Method: {p['method']})")
            else:
                print(f"   ❌ NO PAGE NUMBERS DETECTED")

            # Preview text
            preview_text = (native_text + " " + ocr_text)[:200].replace('\n', ' ').strip()
            print(f"   📝 Text Preview: {preview_text}...")

            all_pages_detailed.append({
                'pdf_position': page_num,
                'found_pages': final_pages,
                'best_page': final_pages[0] if final_pages else None,
                'text_preview': preview_text
            })
    finally:
        # Close the document even if a page could not be processed
        doc.close()

    return all_pages_detailed

def find_page_numbers_comprehensive(text, method_name):
    """Find page-number candidates in ``text``.

    Two families of patterns are tried: numbers enclosed in dashes (hyphen,
    em-dash, en-dash or a mix), and numbers that stand alone on a line
    (with or without surrounding hyphens). Only values from 1 to 100 are
    accepted. Identical candidates are reported once.

    Args:
        text: Text to search; ``None`` or an empty string yields no candidates.
        method_name: Label of the text source. ``"NATIVE"`` gets a higher
            confidence (90 / 85) than any other label (80 / 75).

    Returns:
        List of dicts with ``'value'`` (int), ``'confidence'``, ``'method'``
        (``method_name``, or ``method_name + "_LINE"`` for line matches) and
        ``'pattern'`` (a display form of the match), in discovery order.
    """
    found = []

    if not text:
        return found

    seen = set()

    def record(value, confidence, method, label):
        """Append a candidate unless an identical one is already present."""
        key = (value, method, label)
        if key in seen:
            return
        seen.add(key)
        found.append({
            'value': value,
            'confidence': confidence,
            'method': method,
            'pattern': label
        })

    is_native = method_name == "NATIVE"

    # Pattern 1: number enclosed in dashes, with various spacing
    dash_patterns = [
        r'-\s*(\d+)\s*-',           # -2- or - 2 - or -2 -
        r'—\s*(\d+)\s*—',           # em-dash version
        r'–\s*(\d+)\s*–',           # en-dash version
        r'[\-—–]\s*(\d+)\s*[\-—–]', # any dash type, including mixed pairs
    ]

    for pattern in dash_patterns:
        for match in re.findall(pattern, text):
            page_num = int(match)
            if 1 <= page_num <= 100:  # Reasonable range
                record(page_num, 90 if is_native else 80, method_name, f'-{match}-')

    # Pattern 2: number alone on its line
    line_patterns = [
        r'^\s*-\s*(\d+)\s*-\s*$',   # -2- on its own line
        r'^\s*(\d+)\s*$',           # Just number on its own line
    ]

    for pattern in line_patterns:
        for match in re.findall(pattern, text, re.MULTILINE):
            page_num = int(match)
            if 1 <= page_num <= 100:
                record(page_num, 85 if is_native else 75, f"{method_name}_LINE", f'line:{match}')

    return found

def find_simple_dash_patterns(text):
    """Collect loose page-number candidates for cases the strict patterns miss.

    Any one- or two-digit number from 1 to 50 that is delimited by non-word
    characters or word boundaries is reported with a low confidence (60).
    This catches numbers whose surrounding dashes were mangled by OCR.
    Identical candidates are reported once.

    Args:
        text: Text to search; ``None`` or an empty string yields no candidates.

    Returns:
        List of dicts with ``'value'`` (int), ``'confidence'`` (60),
        ``'method'`` (``"RAW_PATTERN"``) and ``'pattern'`` (the matched
        digits), in discovery order.
    """
    found = []

    if not text:
        return found

    # Both patterns are kept so candidates are discovered in the same order
    # as before; the second one also sees numbers at the very start or end.
    patterns = [
        r'[^\w](\d+)[^\w]',  # Number surrounded by non-word chars
        r'\b(\d+)\b',        # Word boundary numbers
    ]

    seen = set()
    for pattern in patterns:
        for match in re.findall(pattern, text):
            if len(match) > 2 or match in seen:  # Single or double digit only
                continue
            page_num = int(match)
            if 1 <= page_num <= 50:
                seen.add(match)
                found.append({
                    'value': page_num,
                    'confidence': 60,  # Lower confidence
                    'method': "RAW_PATTERN",
                    'pattern': match
                })

    return found

def create_final_order(detailed_pages):
    """Derive a page order from the per-page detection results.

    Every page that has a ``'best_page'`` candidate is keyed by that page
    number. When several PDF positions claim the same page number, the
    claim with the higher confidence wins (on a tie the later position),
    so a weak detection can no longer displace a strong one. Pages without
    a detected number are not part of the returned order.

    Args:
        detailed_pages: List of dicts with ``'pdf_position'`` and
            ``'best_page'`` (a dict with ``'value'``, ``'confidence'`` and
            ``'method'``, or ``None``).

    Returns:
        List of 0-based PDF positions sorted by detected page number, or
        ``None`` when no page number was found at all.
    """
    print("\n" + "=" * 80)
    print("📋 FINAL ORDERING ANALYSIS:")
    print("=" * 80)

    # Map each detected page number to the PDF position that claims it
    page_mapping = {}

    for page_info in detailed_pages:
        best = page_info['best_page']
        if not best:
            continue

        pdf_pos = page_info['pdf_position']
        page_num = best['value']
        current = page_mapping.get(page_num)

        if current is not None:
            keep_current = current['confidence'] > best['confidence']
            kept_pos = current['pdf_position'] if keep_current else pdf_pos
            print(
                f"⚠️ Page -{page_num}- claimed by PDF positions "
                f"{current['pdf_position'] + 1} and {pdf_pos + 1}; keeping {kept_pos + 1}"
            )
            if keep_current:
                continue

        page_mapping[page_num] = {
            'pdf_position': pdf_pos,
            'confidence': best['confidence'],
            'method': best['method']
        }

    if not page_mapping:
        print("❌ No page numbers found!")
        return None

    # Sort by page number
    sorted_pages = sorted(page_mapping)
    final_order = []

    print(f"Found pages: {sorted_pages}")
    print()

    for i, page_num in enumerate(sorted_pages):
        info = page_mapping[page_num]
        pdf_pos = info['pdf_position']

        final_order.append(pdf_pos)

        print(f"Position {i+1}: Page -{page_num}- (PDF position {pdf_pos + 1})")
        print(f"   Confidence: {info['confidence']:.1f} | Method: {info['method']}")

    return final_order

# MAIN FUNCTION
def run_enhanced_detection(pdf_path):
    """Detect page numbers in ``pdf_path`` and derive a reordering sequence.

    Args:
        pdf_path: Path of the PDF to analyse.

    Returns:
        Tuple ``(final_order, detailed_results)``. ``final_order`` is the list
        of 0-based PDF positions in reading order, or ``None`` when no order
        could be determined; ``detailed_results`` is the per-page detection
        data in either case.
    """
    print("🚀 STARTING ENHANCED PAGE NUMBER DETECTION...")

    detailed_results = enhanced_page_detection(pdf_path)
    final_order = create_final_order(detailed_results)

    # Guard clause: nothing usable was detected
    if not final_order:
        print(f"\n❌ Could not determine page order")
        return None, detailed_results

    one_based = [position + 1 for position in final_order]
    print(f"\n✅ FINAL REORDERING SEQUENCE:")
    print(f"PDF positions: {one_based}")
    return final_order, detailed_results

# USAGE: run the enhanced detection on the input PDF and report the order found
pdf_path = '/content/jumbled.pdf'

try:
    order, details = run_enhanced_detection(pdf_path)
    if order:
        # Success line followed by the 1-based order
        print(f"\n🎉 SUCCESS! Found {len(order)} ordered pages" + "\n" + f"Order: {[x+1 for x in order]}")

except Exception as e:
    # Top-level boundary of the notebook cell: report the failure with a traceback
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Complete 25-Page PDF Analyzer - Shows Every Single Page

import fitz
import easyocr
import tempfile
import os
import re
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
import json
from collections import defaultdict

# Initialize the module-level EasyOCR reader (language list ['en'], gpu=True);
# Complete25PageAnalyzer.analyze_every_single_page uses it as a global.
reader = easyocr.Reader(['en'], gpu=True)

class Complete25PageAnalyzer:
    def __init__(self):
        self.all_pages_data = []
        self.confidence_threshold = 75  # Lower threshold to catch more pages

    def analyze_every_single_page(self, pdf_path):
        """Analyse every page of the PDF and print a detailed breakdown.

        For each page the native text is combined with OCR text (4x render,
        contrast boost, grayscale; fragments with confidence above 0.2). The
        combined text is searched for page indicators and summarised. OCR is
        best-effort: on failure the page is analysed with native text only.

        Args:
            pdf_path: Path of the PDF handed to ``fitz.open``.

        Returns:
            ``self.all_pages_data``: one dict per page with the keys
            ``'pdf_position'``, ``'page_indicators'``, ``'meaningful_text'``,
            ``'full_text_length'``, ``'has_substantial_content'`` and
            ``'text_preview'``.
        """
        doc = fitz.open(pdf_path)
        self.all_pages_data = []

        try:
            print(f"🔍 ANALYZING ALL {len(doc)} PAGES IN DETAIL...")
            print("=" * 100)

            for page_num in range(len(doc)):
                page = doc[page_num]

                print(f"\n{'='*20} PDF POSITION {page_num + 1} {'='*20}")

                # Get all text sources
                native_text = page.get_text()

                # Enhanced OCR; temp files are tracked so they are removed
                # even when a step in between fails.
                ocr_text = ""
                temp_paths = []
                try:
                    pix = page.get_pixmap(matrix=fitz.Matrix(4, 4))
                    fd, img_path = tempfile.mkstemp(suffix='.png')
                    os.close(fd)
                    temp_paths.append(img_path)
                    pix.save(img_path)

                    # Preprocess image; the context manager releases the
                    # source file handle before the file is deleted.
                    with Image.open(img_path) as source:
                        img = ImageEnhance.Contrast(source).enhance(1.5)
                        img = img.convert('L')

                    fd, enhanced_path = tempfile.mkstemp(suffix='.png')
                    os.close(fd)
                    temp_paths.append(enhanced_path)
                    img.save(enhanced_path)

                    ocr_results = reader.readtext(enhanced_path, detail=True)
                    ocr_text = ' '.join([text for _, text, conf in ocr_results if conf > 0.2])

                except Exception as e:
                    ocr_text = ""
                    print(f"OCR failed: {e}")
                finally:
                    for temp_path in temp_paths:
                        try:
                            os.remove(temp_path)
                        except OSError:
                            pass  # cleanup is best-effort

                # Combine all text
                full_text = native_text + " " + ocr_text

                # Find ALL possible page indicators (very permissive)
                page_indicators = self._find_all_possible_indicators(full_text, page_num + 1)

                # Get meaningful text preview
                meaningful_text = self._extract_meaningful_text(full_text)

                # Store comprehensive data
                page_data = {
                    'pdf_position': page_num,
                    'page_indicators': page_indicators,
                    'meaningful_text': meaningful_text,
                    'full_text_length': len(full_text.strip()),
                    'has_substantial_content': len(meaningful_text) > 50,
                    'text_preview': meaningful_text[:150] + "..." if len(meaningful_text) > 150 else meaningful_text
                }

                self.all_pages_data.append(page_data)

                # Print detailed breakdown
                print(f"📄 Content Length: {len(full_text.strip())} characters")
                print(f"📊 Page Indicators Found: {len(page_indicators)}")

                if page_indicators:
                    for indicator in page_indicators[:3]:  # Show first 3
                        print(f"   🎯 -{indicator['value']}- (Confidence: {indicator['confidence']:.1f}, Method: {indicator['method']})")

                print(f"📝 Content Preview:")
                print(f"   {meaningful_text[:200]}...")

                # Special analysis for specific patterns
                if self._looks_like_cover_page(meaningful_text):
                    print("   🏷️  LIKELY: Cover Page")
                elif self._looks_like_toc(meaningful_text):
                    print("   🏷️  LIKELY: Table of Contents")
                elif self._looks_like_signature_page(meaningful_text):
                    print("   🏷️  LIKELY: Signature Page")
                elif len(meaningful_text.strip()) < 20:
                    print("   🏷️  LIKELY: Blank/Nearly Blank Page")
        finally:
            # Close the document even if a page could not be processed
            doc.close()

        return self.all_pages_data

    def _find_all_possible_indicators(self, text, pdf_pos):
        """Find all possible page indicators with very permissive matching"""
        indicators = []

        if not text:
            return indicators

        # Primary patterns (high confidence)
        primary_patterns = [
            (r'-\s*(\d+)\s*-', 90, "DASH_FORMAT"),
            (r'—\s*(\d+)\s*—', 90, "EM_DASH"),
            (r'–\s*(\d+)\s*–', 90, "EN_DASH"),
            (r'^\s*-\s*(\d+)\s*-\s*$', 95, "ISOLATED_DASH"),
        ]

        # Secondary patterns (medium confidence)
        secondary_patterns = [
            (r'page\s+(\d+)', 75, "PAGE_WORD"),
            (r'p\.?\s*(\d+)', 70, "P_DOT"),
            (r'(\d+)\s*of\s*\d+', 80, "X_OF_Y"),
            (r'article\s*:?\s*([IVXLCDM]+|\d+)', 70, "ARTICLE"),
        ]

        # Tertiary patterns (lower confidence, catches edge cases)
        tertiary_patterns = [
            (r'\b(\d+)\b', 50, "STANDALONE_NUMBER"),
            (r'^(\d+)$', 60, "LINE_NUMBER"),
            (r'^\s*(\d+)\s*$', 55, "ISOLATED_NUMBER"),
        ]

        all_patterns = primary_patterns + secondary_patterns + tertiary_patterns

        for pattern, base_conf, method in all_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
            for match in matches:
                try:
                    if method == "ARTICLE" and not match.isdigit():
                        # Convert Roman numerals
                        value = self._roman_to_decimal(match.upper())
                    else:
                        value = int(match)

                    # Reasonable range and avoid obvious false positives
                    if 1 <= value <= 100:
                        # Bonus confidence if number matches expected range
                        if 1 <= value <= 30:  # Typical document page range
                            confidence = base_conf * 1.1
                        else:
                            confidence = base_conf * 0.8

                        # Avoid obvious date years as page numbers
                        if value > 1900 and method == "STANDALONE_NUMBER":
                            confidence *= 0.3

                        indicators.append({
                            'value': value,
                            'confidence': min(confidence, 100),
                            'method': method,
                            'pattern_match': match,
                            'found_in_pdf_pos': pdf_pos
                        })

                except (ValueError, AttributeError):
                    continue

        # Sort by confidence, remove duplicates
        seen_values = {}
        for indicator in indicators:
            value = indicator['value']
            if value not in seen_values or indicator['confidence'] > seen_values[value]['confidence']:
                seen_values[value] = indicator

        return list(seen_values.values())

    def _roman_to_decimal(self, roman):
        """Convert Roman numerals to decimal"""
        values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100}
        total = 0
        prev_value = 0
        for char in reversed(roman):
            value = values.get(char, 0)
            if value < prev_value:
                total -= value
            else:
                total += value
            prev_value = value
        return total

    def _extract_meaningful_text(self, text):
        """Extract meaningful text, removing excessive whitespace and noise"""
        # Remove excessive whitespace
        cleaned = re.sub(r'\s+', ' ', text.strip())

        # Remove common OCR artifacts
        cleaned = re.sub(r'[|{}]+', '', cleaned)

        # Focus on actual content
        lines = [line.strip() for line in cleaned.split('\n') if len(line.strip()) > 3]

        return ' '.join(lines)

    def _looks_like_cover_page(self, text):
        """Detect if this looks like a cover page"""
        cover_indicators = ['loan agreement', 'agreement', 'between', 'borrower', 'lender', 'dated']
        text_lower = text.lower()
        return sum(1 for indicator in cover_indicators if indicator in text_lower) >= 3

    def _looks_like_toc(self, text):
        """Detect table of contents"""
        toc_indicators = ['contents', 'article', 'section', 'schedule', 'page']
        text_lower = text.lower()
        return sum(1 for indicator in toc_indicators if indicator in text_lower) >= 2

    def _looks_like_signature_page(self, text):
        """Detect signature page"""
        sig_indicators = ['witness', 'signature', 'seal', 'signed', 'executed']
        text_lower = text.lower()
        return sum(1 for indicator in sig_indicators if indicator in text_lower) >= 2

    def create_comprehensive_order(self):
        """Create comprehensive page ordering using all available data"""
        print("\n" + "=" * 100)
        print("📋 CREATING COMPREHENSIVE PAGE ORDER FROM ALL 25 PAGES")
        print("=" * 100)

        # Collect all page numbers with their best positions
        page_position_map = {}
        confidence_map = {}

        for page_data in self.all_pages_data:
            pdf_pos = page_data['pdf_position']

            for indicator in page_data['page_indicators']:
                page_num = indicator['value']
                confidence = indicator['confidence']

                # Only accept if confidence is above threshold OR if it's the best we have for this page number
                if (confidence >= self.confidence_threshold or
                    page_num not in confidence_map or
                    confidence > confidence_map[page_num]):

                    page_position_map[page_num] = pdf_pos
                    confidence_map[page_num] = confidence

        # Create ordered sequence
        if not page_position_map:
            print("❌ No page numbers found with sufficient confidence!")
            return None, None

        sorted_page_numbers = sorted(page_position_map.keys())
        ordered_positions = [page_position_map[page_num] for page_num in sorted_page_numbers]

        print(f"📊 COMPREHENSIVE RESULTS:")
        print(f"   Total PDF pages: 25")
        print(f"   Pages with numbers found: {len(sorted_page_numbers)}")
        print(f"   Page numbers found: {sorted_page_numbers}")
        print(f"   Confidence threshold used: {self.confidence_threshold}")
        print()

        print(f"📋 FINAL ORDERING:")
        for i, page_num in enumerate(sorted_page_numbers):
            pdf_pos = page_position_map[page_num]
            confidence = confidence_map[page_num]
            print(f"   Position {i+1}: Page -{page_num}- (PDF position {pdf_pos + 1}) [Confidence: {confidence:.1f}]")

        # Show unordered pages
        ordered_pdf_positions = set(ordered_positions)
        unordered_positions = [i for i in range(25) if i not in ordered_pdf_positions]

        if unordered_positions:
            print(f"\n📄 PAGES WITHOUT CLEAR NUMBERING:")
            for pdf_pos in unordered_positions:
                page_data = self.all_pages_data[pdf_pos]
                preview = page_data['text_preview'][:100]
                print(f"   PDF position {pdf_pos + 1}: {preview}...")

        return ordered_positions, {
            'total_pages': 25,
            'numbered_pages': len(sorted_page_numbers),
            'unnumbered_pages': len(unordered_positions),
            'page_numbers_found': sorted_page_numbers,
            'confidence_threshold': self.confidence_threshold
        }

    def create_reordered_pdf(self, input_pdf_path, output_pdf_path, ordered_positions):
        """Create reordered PDF"""
        if not ordered_positions:
            print("❌ Cannot reorder - no page order available")
            return False

        print(f"\n📄 Creating reordered PDF with {len(ordered_positions)} pages...")

        doc = fitz.open(input_pdf_path)
        reordered_doc = fitz.open()

        for new_pos, orig_pdf_pos in enumerate(ordered_positions):
            reordered_doc.insert_pdf(doc, from_page=orig_pdf_pos, to_page=orig_pdf_pos)
            print(f"   Position {new_pos + 1} ← PDF page {orig_pdf_pos + 1}")

        reordered_doc.save(output_pdf_path)
        doc.close()
        reordered_doc.close()

        print(f"✅ Reordered PDF saved: {output_pdf_path}")
        return True

    def process_complete_25_pages(self, input_pdf_path):
        """Analyse the PDF, reorder it and write a JSON analysis report.

        The output files are placed next to the input: the reordered PDF as
        ``<base>_complete_reorder.pdf`` and the report as
        ``<base>_25_page_analysis.json``, where ``<base>`` is the input path
        without a trailing ``.pdf`` extension.

        Args:
            input_pdf_path: Path of the PDF to process.

        Returns:
            On success a dict with ``'success'``, ``'reordered_pdf'``,
            ``'analysis_report'`` and ``'summary'``. Otherwise
            ``{'success': False, 'error': <message>}``; exceptions are
            reported this way instead of being raised.
        """
        print("🚀 PROCESSING ALL 25 PAGES COMPREHENSIVELY...")

        # Strip only a real trailing ".pdf" extension (any case); a ".pdf"
        # elsewhere in the path, e.g. in a directory name, is left alone.
        root, extension = os.path.splitext(input_pdf_path)
        base_name = root if extension.lower() == '.pdf' else input_pdf_path
        output_pdf = f"{base_name}_complete_reorder.pdf"
        report_path = f"{base_name}_25_page_analysis.json"

        try:
            # Analyze every page
            self.analyze_every_single_page(input_pdf_path)

            # Create comprehensive order
            ordered_positions, summary = self.create_comprehensive_order()

            if not ordered_positions:
                return {'success': False, 'error': 'Could not determine page order'}

            # Create reordered PDF
            success = self.create_reordered_pdf(input_pdf_path, output_pdf, ordered_positions)

            # Save detailed report
            report_data = {
                'summary': summary,
                'all_pages_data': self.all_pages_data
            }

            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(report_data, f, indent=2)

            return {
                'success': success,
                'reordered_pdf': output_pdf,
                'analysis_report': report_path,
                'summary': summary
            }

        except Exception as e:
            # Boundary of the pipeline: report the failure to the caller
            print(f"❌ Error: {e}")
            import traceback
            traceback.print_exc()
            return {'success': False, 'error': str(e)}

# MAIN FUNCTION
def main():
    """Run the 25-page analyzer on the hard-coded input PDF and print the outcome."""
    input_pdf = '/content/jumbled.pdf'  # Your PDF path

    result = Complete25PageAnalyzer().process_complete_25_pages(input_pdf)

    # Handle failure first so the success report below stays un-nested.
    if not result['success']:
        print(f"❌ Failed: {result.get('error', 'Unknown error')}")
        return

    print("\n🎉 SUCCESS - ALL 25 PAGES ANALYZED!")
    print(f"📁 Reordered PDF: {result['reordered_pdf']}")
    print(f"📊 Analysis Report: {result['analysis_report']}")

    summary = result['summary']
    print("\n📋 FINAL SUMMARY:")
    # (label, summary key) pairs, printed in this order.
    summary_lines = (
        ("Total pages processed", 'total_pages'),
        ("Pages with numbers", 'numbered_pages'),
        ("Pages without numbers", 'unnumbered_pages'),
        ("Page numbers found", 'page_numbers_found'),
    )
    for label, key in summary_lines:
        print(f"   {label}: {summary[key]}")


if __name__ == "__main__":
    main()

🚀 PROCESSING ALL 25 PAGES COMPREHENSIVELY...
🔍 ANALYZING ALL 25 PAGES IN DETAIL...

📄 Content Length: 246 characters
📊 Page Indicators Found: 0
📝 Content Preview:
   ARTICLE _ WV APPOINTMENT QF NOMNEE DIRECTORS The Borrower agrees that IREDA shall be entitled to appoint and withdraw from time to time nominee directors on the Board of Directors of the Borrower at a...

📄 Content Length: 0 characters
📊 Page Indicators Found: 0
📝 Content Preview:
   ...
   🏷️  LIKELY: Blank/Nearly Blank Page

📄 Content Length: 1140 characters
📊 Page Indicators Found: 0
📝 Content Preview:
   220- (vii) The Borrower agrees and undertakes to furnish certificate from the Chartered Company Secretary Company Secretary in the employment of the Borrower that no Director of the Company has been d...

📄 Content Length: 338 characters
📊 Page Indicators Found: 0
📝 Content Preview:
   LOAN AGREEMENT No. Dated (Project No. Borrower Mls_ Limited Details of Project Installation of (Project No. Loan Amount Rs. Lakhs Secur

In [None]:
# Complete PDF Analysis & Reordering System - Fixed Version (Preserves All Pages)
!pip install pymupdf easyocr pillow numpy

import fitz
import easyocr
import tempfile
import os
import re
import json
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
from collections import defaultdict

# Initialize EasyOCR
reader = easyocr.Reader(['en'], gpu=True)

class CompletePDFProcessor:
    def __init__(self):
        self.page_data = []
        self.analysis_results = {}

    def enhanced_page_detection(self, pdf_path):
        """Detect printed page numbers on every page using four strategies.

        For each page, candidates from native text extraction, standard OCR
        (3x render), enhanced OCR (6x render with contrast boost, grayscale
        and sharpening) and OCR of the top 25% of the page (4x render) are
        merged. For each distinct page-number value only the
        highest-confidence candidate is kept. ``self.page_data`` is rebuilt
        from scratch on every call.

        Args:
            pdf_path: Path of the PDF to analyse.

        Returns:
            list[dict]: ``self.page_data``; one entry per page with keys
            ``pdf_position`` (0-based), ``found_pages`` (candidates sorted by
            descending confidence), ``best_page`` (first candidate or
            ``None``), ``text_preview`` and ``all_text``.
        """
        doc = fitz.open(pdf_path)
        self.page_data = []

        try:
            print(f"🔍 ANALYZING {len(doc)} pages with enhanced detection...")
            print("=" * 80)

            for page_num in range(len(doc)):
                page = doc[page_num]

                print(f"\n📄 Processing PDF Position {page_num + 1}:")

                # Reset per page: if an OCR pass fails, the stored text must
                # be empty rather than whatever the previous page produced.
                ocr_text = ""
                enhanced_text = ""

                # Strategy 1: Native PDF text extraction
                native_text = page.get_text()
                native_pages = self._find_page_numbers_comprehensive(native_text, "NATIVE")

                # Strategy 2: Standard OCR
                standard_pages = []
                try:
                    # TemporaryDirectory removes the rendered image even when
                    # OCR raises, and avoids the race inherent in mktemp().
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        img_path = os.path.join(tmp_dir, 'page.png')
                        page.get_pixmap(matrix=fitz.Matrix(3, 3)).save(img_path)
                        ocr_results = reader.readtext(img_path, detail=True)

                    ocr_text = ' '.join([text for _, text, conf in ocr_results if conf > 0.3])
                    standard_pages = self._find_page_numbers_comprehensive(ocr_text, "STANDARD_OCR")
                except Exception as e:
                    print(f"   ⚠️ Standard OCR failed: {e}")

                # Strategy 3: Enhanced OCR (higher resolution + preprocessing)
                enhanced_pages = []
                try:
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        img_path = os.path.join(tmp_dir, 'page.png')
                        enhanced_path = os.path.join(tmp_dir, 'page_enhanced.png')
                        page.get_pixmap(matrix=fitz.Matrix(6, 6)).save(img_path)

                        # Close the source image before the directory is removed.
                        with Image.open(img_path) as raw_img:
                            img = ImageEnhance.Contrast(raw_img).enhance(2.0)
                        img = img.convert('L').filter(ImageFilter.SHARPEN)
                        img.save(enhanced_path)

                        ocr_results = reader.readtext(
                            enhanced_path, detail=True, width_ths=0.3, height_ths=0.3
                        )

                    enhanced_text = ' '.join([text for _, text, conf in ocr_results if conf > 0.2])
                    enhanced_pages = self._find_page_numbers_comprehensive(enhanced_text, "ENHANCED_OCR")
                except Exception as e:
                    print(f"   ⚠️ Enhanced OCR failed: {e}")

                # Strategy 4: Header region OCR
                region_pages = []
                try:
                    pix = page.get_pixmap(matrix=fitz.Matrix(4, 4))
                    # The reshape assumes a 3-channel pixmap; any other layout
                    # raises and is reported by the handler below.
                    img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)

                    header_height = int(pix.height * 0.25)
                    header_img = Image.fromarray(img_array[:header_height, :, :])

                    with tempfile.TemporaryDirectory() as tmp_dir:
                        header_path = os.path.join(tmp_dir, 'header.png')
                        header_img.save(header_path)
                        header_results = reader.readtext(header_path, detail=True)

                    header_text = ' '.join([text for _, text, conf in header_results if conf > 0.2])
                    region_pages = self._find_page_numbers_comprehensive(header_text, "HEADER_REGION")
                except Exception as e:
                    print(f"   ⚠️ Region OCR failed: {e}")

                # Combine all findings; per value keep the highest confidence
                unique_pages = {}
                for candidate in native_pages + standard_pages + enhanced_pages + region_pages:
                    kept = unique_pages.get(candidate['value'])
                    if kept is None or candidate['confidence'] > kept['confidence']:
                        unique_pages[candidate['value']] = candidate

                final_pages = sorted(unique_pages.values(), key=lambda x: x['confidence'], reverse=True)

                # Show results
                if final_pages:
                    print(f"   ✅ Found: {[p['value'] for p in final_pages[:3]]}...")  # Show top 3
                else:
                    print("   ❌ No page numbers detected")

                # Store page data
                preview_text = (native_text + " " + ocr_text)[:200].replace('\n', ' ').strip()

                self.page_data.append({
                    'pdf_position': page_num,
                    'found_pages': final_pages,
                    'best_page': final_pages[0] if final_pages else None,
                    'text_preview': preview_text,
                    'all_text': native_text + " " + enhanced_text
                })
        finally:
            # Release the document even if a page raised outside the OCR guards.
            doc.close()

        print(f"\n✅ Analysis complete: {len(self.page_data)} pages processed")
        return self.page_data

    def _find_page_numbers_comprehensive(self, text, method_name):
        """Find page numbers with multiple pattern variations"""
        found = []

        if not text:
            return found

        # Enhanced patterns for different dash types and spacing
        patterns = [
            (r'-\s*(\d+)\s*-', 90),           # -2- or - 2 -
            (r'—\s*(\d+)\s*—', 90),           # em-dash version
            (r'–\s*(\d+)\s*–', 90),           # en-dash version
            (r'[\-—–]\s*(\d+)\s*[\-—–]', 85), # any dash type
            (r'^\s*-\s*(\d+)\s*-\s*$', 95),   # -2- on own line
            (r'page\s+(\d+)', 80),            # page 2
            (r'p\.?\s*(\d+)', 75),            # p. 2
            (r'(\d+)\s*of\s*\d+', 85),       # 2 of 10
        ]

        confidence_multiplier = {'NATIVE': 1.0, 'ENHANCED_OCR': 0.9, 'STANDARD_OCR': 0.8, 'HEADER_REGION': 0.95}
        multiplier = confidence_multiplier.get(method_name, 0.7)

        for pattern, base_conf in patterns:
            matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
            for match in matches:
                try:
                    page_num = int(match)
                    if 1 <= page_num <= 100:  # Reasonable range
                        found.append({
                            'value': page_num,
                            'confidence': base_conf * multiplier,
                            'method': method_name,
                            'pattern': f'{pattern} -> {match}'
                        })
                except:
                    pass

        return found

    def analyze_and_create_order(self):
        """Analyze findings and create optimal page order - PRESERVE ALL PAGES"""
        print("\n" + "=" * 80)
        print("📋 CREATING OPTIMAL PAGE ORDER (PRESERVING ALL PAGES):")
        print("=" * 80)

        # Extract high-confidence page numbers
        page_mapping = {}
        confidence_threshold = 50  # Adjust based on results

        for page_info in self.page_data:
            pdf_pos = page_info['pdf_position']
            if page_info['best_page'] and page_info['best_page']['confidence'] >= confidence_threshold:
                page_num = page_info['best_page']['value']
                confidence = page_info['best_page']['confidence']
                method = page_info['best_page']['method']

                # Only keep if this is the best position for this page number
                if page_num not in page_mapping or confidence > page_mapping[page_num]['confidence']:
                    page_mapping[page_num] = {
                        'pdf_position': pdf_pos,
                        'confidence': confidence,
                        'method': method
                    }

        # CREATE FINAL ORDER INCLUDING ALL PAGES
        final_order = []
        used_positions = set()

        # STEP 1: Add numbered pages in order
        if page_mapping:
            sorted_pages = sorted(page_mapping.keys())
            print(f"Found numbered pages: {sorted_pages}")

            for page_num in sorted_pages:
                pdf_pos = page_mapping[page_num]['pdf_position']
                final_order.append(pdf_pos)
                used_positions.add(pdf_pos)

                info = page_mapping[page_num]
                print(f"Position {len(final_order)}: Page -{page_num}- (PDF pos {pdf_pos + 1}) | Conf: {info['confidence']:.1f}")

        # STEP 2: Add pages WITHOUT detected numbers at the end
        pages_without_numbers = []
        for page_info in self.page_data:
            pdf_pos = page_info['pdf_position']
            if pdf_pos not in used_positions:
                pages_without_numbers.append({
                    'pdf_position': pdf_pos,
                    'text_preview': page_info['text_preview']
                })

        print(f"\nAdding {len(pages_without_numbers)} pages without detected numbers:")
        for page in pages_without_numbers:
            pdf_pos = page['pdf_position']
            final_order.append(pdf_pos)
            preview = page['text_preview'][:50]
            print(f"Position {len(final_order)}: PDF pos {pdf_pos + 1} (No number detected) - {preview}...")

        # Analysis summary
        total_pages = len(self.page_data)
        numbered_pages = len(page_mapping)
        unnumbered_pages = len(pages_without_numbers)

        self.analysis_results = {
            'total_pdf_pages': total_pages,
            'pages_with_numbers': numbered_pages,
            'pages_without_numbers': unnumbered_pages,
            'page_range': f"{min(page_mapping.keys())}-{max(page_mapping.keys())}" if page_mapping else "None",
            'confidence_threshold': confidence_threshold,
            'final_order': final_order,
            'page_mapping': page_mapping
        }

        print(f"\n📊 ANALYSIS SUMMARY:")
        print(f"   Total PDF pages: {total_pages}")
        print(f"   Pages with numbers: {numbered_pages}")
        print(f"   Pages without numbers: {unnumbered_pages}")
        print(f"   Final order length: {len(final_order)}")
        print(f"   Page range found: {self.analysis_results['page_range']}")

        return final_order, self.analysis_results

    def create_reordered_pdf(self, input_pdf_path, output_pdf_path, page_order):
        """Create reordered PDF based on analysis"""
        if not page_order:
            print("❌ Cannot create reordered PDF - no page order available")
            return False

        print(f"\n📄 Creating reordered PDF...")
        print(f"Input: {input_pdf_path}")
        print(f"Output: {output_pdf_path}")
        print(f"Order: {[pos + 1 for pos in page_order]}")

        doc = fitz.open(input_pdf_path)
        reordered_doc = fitz.open()

        for new_pos, orig_idx in enumerate(page_order):
            reordered_doc.insert_pdf(doc, from_page=orig_idx, to_page=orig_idx)
            print(f"  Position {new_pos + 1} ← PDF page {orig_idx + 1}")

        reordered_doc.save(output_pdf_path)
        doc.close()
        reordered_doc.close()

        print(f"\n✅ Reordered PDF created successfully!")
        return True

    def save_analysis_report(self, output_path):
        """Save detailed analysis report"""
        report_data = {
            'analysis_results': self.analysis_results,
            'detailed_page_data': []
        }

        for page in self.page_data:
            page_data = {
                'pdf_position': page['pdf_position'],
                'found_pages': page['found_pages'],
                'best_page': page['best_page'],
                'text_preview': page['text_preview']
            }
            report_data['detailed_page_data'].append(page_data)

        with open(output_path, 'w') as f:
            json.dump(report_data, f, indent=2)

        print(f"📄 Analysis report saved: {output_path}")

    def process_pdf_complete(self, input_pdf_path, output_pdf_path=None, report_path=None):
        """Complete processing pipeline"""
        print("🚀 STARTING COMPLETE PDF PROCESSING...")

        # Generate default output paths
        base_name = input_pdf_path.replace('.pdf', '')
        if not output_pdf_path:
            output_pdf_path = f"{base_name}_reordered.pdf"
        if not report_path:
            report_path = f"{base_name}_analysis_report.json"

        try:
            # Step 1: Enhanced page detection
            self.enhanced_page_detection(input_pdf_path)

            # Step 2: Analyze and create order
            page_order, analysis = self.analyze_and_create_order()

            # Step 3: Create reordered PDF
            if page_order:
                success = self.create_reordered_pdf(input_pdf_path, output_pdf_path, page_order)

                # Step 4: Save analysis report
                self.save_analysis_report(report_path)

                if success:
                    return {
                        'success': True,
                        'reordered_pdf': output_pdf_path,
                        'analysis_report': report_path,
                        'pages_reordered': len(page_order),
                        'analysis_summary': analysis
                    }

            return {
                'success': False,
                'error': 'Could not determine page order',
                'analysis_report': report_path
            }

        except Exception as e:
            print(f"❌ Error during processing: {e}")
            import traceback
            traceback.print_exc()
            return {
                'success': False,
                'error': str(e)
            }

# USAGE EXAMPLE
def main():
    """Process the hard-coded jumbled PDF and print where the results went."""

    # Input PDF path - CHANGE THIS TO YOUR PDF
    input_pdf = '/content/jumbled.pdf'  # <-- Change this path

    # Build the processor and run the whole pipeline in one go
    result = CompletePDFProcessor().process_pdf_complete(input_pdf)

    # Failure path first; a report may still exist and is worth pointing at.
    if not result['success']:
        print(f"\n😞 FAILED: {result.get('error', 'Unknown error')}")
        if 'analysis_report' in result:
            print(f"📊 Check analysis report: {result['analysis_report']}")
        return

    print("\n🎉 SUCCESS!")
    print(f"📁 Reordered PDF: {result['reordered_pdf']}")
    print(f"📊 Analysis Report: {result['analysis_report']}")
    print(f"📄 Pages reordered: {result['pages_reordered']}")

    # Show summary
    summary = result['analysis_summary']
    print("\n📋 SUMMARY:")
    for label, key in (
        ("Total pages", 'total_pdf_pages'),
        ("Pages with numbers", 'pages_with_numbers'),
        ("Page range", 'page_range'),
    ):
        print(f"   {label}: {summary[key]}")


if __name__ == "__main__":
    main()


🚀 STARTING COMPLETE PDF PROCESSING...
🔍 ANALYZING 25 pages with enhanced detection...

📄 Processing PDF Position 1:
   ❌ No page numbers detected

📄 Processing PDF Position 2:
   ❌ No page numbers detected

📄 Processing PDF Position 3:
   ✅ Found: [20]...

📄 Processing PDF Position 4:
   ❌ No page numbers detected

📄 Processing PDF Position 5:
   ❌ No page numbers detected

📄 Processing PDF Position 6:
   ❌ No page numbers detected

📄 Processing PDF Position 7:
   ✅ Found: [16]...

📄 Processing PDF Position 8:
   ✅ Found: [2]...

📄 Processing PDF Position 9:
   ❌ No page numbers detected

📄 Processing PDF Position 10:
   ✅ Found: [6]...

📄 Processing PDF Position 11:
   ❌ No page numbers detected

📄 Processing PDF Position 12:
   ✅ Found: [12]...

📄 Processing PDF Position 13:
   ✅ Found: [15]...

📄 Processing PDF Position 14:
   ✅ Found: [22]...

📄 Processing PDF Position 15:
   ✅ Found: [21]...

📄 Processing PDF Position 16:
   ✅ Found: [10]...

📄 Processing PDF Position 17:
   ✅ Fou

In [None]:
import fitz
import easyocr
import tempfile
import os
import re
from PIL import Image, ImageEnhance
import numpy as np

# Initialize EasyOCR
reader = easyocr.Reader(['en'], gpu=True)

class PositionAwareOCR:
    def __init__(self):
        """Bind the module-level EasyOCR ``reader`` to this instance."""
        # Reuses the reader created at import time instead of building a new one.
        self.reader = reader

    def extract_with_position_tolerance(self, pdf_path):
        """Collect candidate page numbers for every page with three OCR methods.

        Each page is scanned by full-page OCR, margin/region OCR and
        multi-resolution OCR; the union of their results, restricted to
        1..50, is reported per page.

        Args:
            pdf_path: Path of the PDF to scan.

        Returns:
            list[dict]: one entry per page with ``pdf_position`` (0-based),
            ``found_numbers`` (sorted) and ``methods`` (raw output of each
            of the three methods).
        """
        doc = fitz.open(pdf_path)
        results = []

        for page_num in range(len(doc)):
            page = doc[page_num]

            print(f"\n📄 Processing PDF position {page_num + 1}")

            # Page size drives the margin rectangles used by the region scan
            bounds = page.rect

            # Method 1: full page, Method 2: margins/regions, Method 3: several zooms
            full_page_numbers = self.standard_ocr_extract(page)
            region_numbers = self.region_based_ocr(page, bounds.width, bounds.height)
            multi_res_numbers = self.multi_resolution_ocr(page)

            # Union of all methods, limited to plausible page numbers
            candidates = set(full_page_numbers) | set(region_numbers) | set(multi_res_numbers)
            found_numbers = sorted(n for n in candidates if 1 <= n <= 50)

            print(f"   🔢 Found numbers: {found_numbers}")

            results.append({
                'pdf_position': page_num,
                'found_numbers': found_numbers,
                'methods': {
                    'full_page': full_page_numbers,
                    'region_based': region_numbers,
                    'multi_resolution': multi_res_numbers
                }
            })

        doc.close()
        return results

    def standard_ocr_extract(self, page):
        """Standard full-page OCR"""
        try:
            pix = page.get_pixmap(matrix=fitz.Matrix(4, 4))
            img_path = tempfile.mktemp(suffix='.png')
            pix.save(img_path)

            # Enhanced preprocessing
            img = Image.open(img_path)
            enhancer = ImageEnhance.Contrast(img)
            img = enhancer.enhance(1.5)
            img = img.convert('L')

            enhanced_path = tempfile.mktemp(suffix='.png')
            img.save(enhanced_path)

            # OCR with lower confidence threshold
            ocr_results = self.reader.readtext(enhanced_path, detail=True)

            # Extract all numbers
            numbers = []
            for bbox, text, conf in ocr_results:
                # Try to find numbers in the text
                found_nums = re.findall(r'\b(\d+)\b', text)
                for num_str in found_nums:
                    if num_str.isdigit():
                        numbers.append(int(num_str))

            os.remove(img_path)
            os.remove(enhanced_path)

            return numbers

        except Exception as e:
            print(f"   ❌ Standard OCR failed: {e}")
            return []

    def region_based_ocr(self, page, page_width, page_height):
        """OCR the page margins and return the 1-2 digit numbers found there.

        Six clip rectangles are scanned at 6x: top 10%, bottom 10%, left 10%,
        right 10%, bottom-centre (middle 40% of the width, lowest 15%) and
        top-centre (middle 40% of the width, top 15%). A region that fails
        is reported and skipped; the remaining regions are still scanned.

        Args:
            page: The PDF page object to render.
            page_width: Page width in the page's own coordinate units.
            page_height: Page height in the page's own coordinate units.

        Returns:
            list[int]: distinct numbers of at most two digits, in no
            particular order.
        """
        numbers = []

        # Define regions to scan (top/bottom margins, left/right margins)
        regions = [
            # Top margin (full width)
            fitz.Rect(0, 0, page_width, page_height * 0.1),
            # Bottom margin (full width)
            fitz.Rect(0, page_height * 0.9, page_width, page_height),
            # Left margin (full height)
            fitz.Rect(0, 0, page_width * 0.1, page_height),
            # Right margin (full height)
            fitz.Rect(page_width * 0.9, 0, page_width, page_height),
            # Center bottom (common page number location)
            fitz.Rect(page_width * 0.3, page_height * 0.85, page_width * 0.7, page_height),
            # Center top
            fitz.Rect(page_width * 0.3, 0, page_width * 0.7, page_height * 0.15)
        ]

        for i, region in enumerate(regions):
            try:
                # Extract this region as image
                pix = page.get_pixmap(matrix=fitz.Matrix(6, 6), clip=region)  # Higher resolution

                if not (pix.width > 10 and pix.height > 10):  # Skip tiny regions
                    continue

                # TemporaryDirectory removes both images even when OCR raises,
                # and avoids the race inherent in tempfile.mktemp().
                with tempfile.TemporaryDirectory() as tmp_dir:
                    img_path = os.path.join(tmp_dir, 'region.png')
                    enhanced_path = os.path.join(tmp_dir, 'region_enhanced.png')
                    pix.save(img_path)

                    # More aggressive enhancement for small regions
                    with Image.open(img_path) as raw_img:
                        img = ImageEnhance.Contrast(raw_img).enhance(2.5)
                    img = ImageEnhance.Sharpness(img).enhance(1.5)
                    img = img.convert('L')
                    img.save(enhanced_path)

                    ocr_results = self.reader.readtext(enhanced_path, detail=True)

                for _bbox, text, conf in ocr_results:
                    # No confidence cut-off: isolated numbers are accepted as-is
                    for num_str in re.findall(r'\b(\d+)\b', text.strip()):
                        if len(num_str) <= 2:  # 1-2 digit numbers
                            numbers.append(int(num_str))
                            print(f"     🎯 Region {i} found: {num_str} (conf: {conf:.2f})")

            except Exception as e:
                # Best effort per region: report instead of failing silently.
                print(f"     ⚠️ Region {i} OCR failed: {e}")
                continue

        return list(set(numbers))  # Remove duplicates

    def multi_resolution_ocr(self, page):
        """Try OCR at multiple resolutions to catch different text sizes"""
        numbers = []

        # Try different zoom levels
        zoom_levels = [2, 3, 4, 6, 8]  # Different resolutions

        for zoom in zoom_levels:
            try:
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                img_path = tempfile.mktemp(suffix='.png')
                pix.save(img_path)

                # Minimal preprocessing to preserve text
                img = Image.open(img_path)
                img = img.convert('L')

                # OCR at this resolution
                ocr_results = self.reader.readtext(img_path, detail=True)

                for bbox, text, conf in ocr_results:
                    # Look for standalone numbers or page patterns
                    patterns = [
                        r'^\s*(\d+)\s*$',  # Isolated number
                        r'-\s*(\d+)\s*-',   # -7-
                        r'page\s*(\d+)',    # page 7
                        r'(\d+)\s*$'        # Number at end of line
                    ]

                    for pattern in patterns:
                        matches = re.findall(pattern, text.strip(), re.IGNORECASE)
                        for match in matches:
                            if match.isdigit():
                                num = int(match)
                                if 1 <= num <= 50:  # Reasonable page range
                                    numbers.append(num)
                                    print(f"     🔍 Zoom {zoom}x found: {num} (pattern: {pattern[:10]}...)")

                os.remove(img_path)

            except Exception as e:
                continue

        return list(set(numbers))

    def create_comprehensive_mapping(self, results):
        """Create comprehensive page mapping from position-aware results"""
        print(f"\n{'='*60} POSITION-AWARE RESULTS {'='*60}")

        # Collect all found page numbers with their positions
        page_to_position = {}
        position_to_pages = {}

        for result in results:
            pdf_pos = result['pdf_position']
            found_numbers = result['found_numbers']

            position_to_pages[pdf_pos] = found_numbers

            for page_num in found_numbers:
                if page_num not in page_to_position:
                    page_to_position[page_num] = []
                page_to_position[page_num].append(pdf_pos)

        print("📊 COMPREHENSIVE FINDINGS:")
        print(f"   Total PDF positions scanned: {len(results)}")
        print(f"   Positions with page numbers: {len([r for r in results if r['found_numbers']])}")

        # Show what we found
        print(f"\n📋 PAGE NUMBERS FOUND:")
        for page_num in sorted(page_to_position.keys()):
            positions = page_to_position[page_num]
            print(f"   Page {page_num}: Found at PDF position(s) {positions}")

        # Show positions without clear numbers
        print(f"\n📄 PDF POSITIONS WITHOUT CLEAR PAGE NUMBERS:")
        for result in results:
            if not result['found_numbers']:
                print(f"   PDF position {result['pdf_position'] + 1}: No clear page numbers")

        # Try to create best mapping (one page number per position)
        best_mapping = {}
        for result in results:
            pdf_pos = result['pdf_position']
            found_numbers = result['found_numbers']

            if len(found_numbers) == 1:
                # Clear single match
                best_mapping[found_numbers[0]] = pdf_pos
            elif len(found_numbers) > 1:
                # Multiple matches - need heuristic
                # Prefer numbers that appear uniquely
                unique_nums = [n for n in found_numbers if len(page_to_position[n]) == 1]
                if unique_nums:
                    best_mapping[unique_nums[0]] = pdf_pos

        return best_mapping, page_to_position

# MAIN FUNCTION
def main():
    """Scan the hard-coded PDF and report whether the target pages were found.

    Returns:
        tuple: ``(results, best_mapping)`` from the analyzer.
    """
    input_pdf = '/content/jumbled.pdf'

    analyzer = PositionAwareOCR()
    results = analyzer.extract_with_position_tolerance(input_pdf)
    best_mapping, all_findings = analyzer.create_comprehensive_mapping(results)

    print("\n🎯 TARGETING MISSING PAGES [7, 9, 5, 3]:")

    # Page numbers this run is checked against
    for target in [7, 9, 5, 3]:
        if target not in all_findings:
            print(f"   ❌ Page {target}: Still not found")
            continue

        # Stored positions are 0-based; show them 1-based
        one_based = [p + 1 for p in all_findings[target]]
        print(f"   ✅ Page {target}: FOUND at PDF position(s) {one_based}")

    return results, best_mapping


if __name__ == "__main__":
    main()


📄 Processing PDF position 1
   🔢 Found numbers: []

📄 Processing PDF position 2
   🔢 Found numbers: []

📄 Processing PDF position 3
     🎯 Region 0 found: 20 (conf: 0.32)
     🎯 Region 5 found: 20 (conf: 0.81)
     🔍 Zoom 3x found: 1 (pattern: ^\s*(\d+)\...)
     🔍 Zoom 3x found: 1 (pattern: (\d+)\s*$...)
     🔍 Zoom 4x found: 20 (pattern: -\s*(\d+)\...)
     🔍 Zoom 6x found: 20 (pattern: -\s*(\d+)\...)
     🔍 Zoom 8x found: 20 (pattern: -\s*(\d+)\...)
   🔢 Found numbers: [1, 20]

📄 Processing PDF position 4
     🔍 Zoom 2x found: 3 (pattern: (\d+)\s*$...)
     🔍 Zoom 3x found: 3 (pattern: (\d+)\s*$...)
     🔍 Zoom 4x found: 3 (pattern: (\d+)\s*$...)
     🔍 Zoom 6x found: 3 (pattern: (\d+)\s*$...)
     🔍 Zoom 8x found: 3 (pattern: (\d+)\s*$...)
   🔢 Found numbers: [3]

📄 Processing PDF position 5
     🔍 Zoom 8x found: 1 (pattern: (\d+)\s*$...)
   🔢 Found numbers: [1, 3]

📄 Processing PDF position 6
     🎯 Region 0 found: 9 (conf: 0.93)
     🎯 Region 5 found: 9 (conf: 0.97)
     🔍 Zoom 

In [None]:
import fitz
import easyocr
import tempfile
import os
import re
import json
import argparse
import logging
from pathlib import Path
from PIL import Image, ImageEnhance
from collections import defaultdict, Counter
import numpy as np
import threading
from concurrent.futures import ThreadPoolExecutor
import torch

class AdaptivePDFReorderer:
    """Reorder a PDF by the page numbers that OCR detects on each page.

    Every page is rendered and OCR'd with three strategies (full page,
    header/footer regions, several zoom levels).  Numbers reported by at
    least two strategies become candidates, candidates are resolved into a
    ``{page_number: pdf_position}`` mapping, and the mapping drives the
    creation of a reordered copy of the PDF plus a JSON report.
    """

    # Largest value accepted as a plausible printed page number.
    MAX_PAGE_NUMBER = 200

    # Seconds to wait for one page's analysis result before it is reported
    # as failed.
    PAGE_TIMEOUT_SECONDS = 30

    # Patterns tried against each OCR text fragment; compiled once instead of
    # once per fragment.
    _NUMBER_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'^\s*(\d+)\s*$',   # Isolated number
            r'-\s*(\d+)\s*-',   # Dash format, e.g. "- 7 -"
            r'page\s*(\d+)',    # "Page N"
            r'(\d+)\s*$',       # Number at line end
            r'^\s*(\d+)',       # Number at line start
        )
    )

    def __init__(self, gpu_optimization=True, max_workers=None):
        """Set up logging, the OCR reader and empty analysis state.

        Args:
            gpu_optimization: Use the GPU for OCR when CUDA is available.
            max_workers: Number of worker threads; defaults to
                ``min(8, cpu count)``.
        """
        self.setup_logging()
        self.gpu_optimization = gpu_optimization
        # os.cpu_count() returns None when the count cannot be determined.
        self.max_workers = max_workers or min(8, os.cpu_count() or 1)

        # PyMuPDF is documented as not thread-safe, so every call into it
        # made from a worker thread is serialized through this lock.
        self._fitz_lock = threading.Lock()

        # Initialize EasyOCR with GPU optimization
        self.reader = self._initialize_ocr()

        # Analysis results
        self.page_findings = {}
        self.confidence_scores = {}
        self.disambiguation_data = {}
        self.total_pages = 0

        self.logger.info(f"Initialized with GPU optimization: {gpu_optimization}")
        self.logger.info(f"Max workers: {self.max_workers}")

    def setup_logging(self):
        """Configure root logging at INFO level and create ``self.logger``."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def _initialize_ocr(self):
        """Create the EasyOCR reader, falling back to CPU on any failure."""
        try:
            if self.gpu_optimization and torch.cuda.is_available():
                # Optimize GPU memory usage
                torch.cuda.empty_cache()
                reader = easyocr.Reader(['en'], gpu=True, verbose=False)
                self.logger.info(f"GPU initialized. CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
            else:
                reader = easyocr.Reader(['en'], gpu=False, verbose=False)
                self.logger.info("Using CPU for OCR")
            return reader
        except Exception as e:
            self.logger.error(f"OCR initialization failed: {e}")
            return easyocr.Reader(['en'], gpu=False, verbose=False)

    def analyze_pdf(self, pdf_path):
        """Detect page numbers on every page and build the page mapping.

        Args:
            pdf_path: Path of the PDF to analyze.

        Returns:
            dict mapping detected page number -> 0-based PDF position.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        if not Path(pdf_path).exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        # Start clean so a second call does not mix in findings from an
        # earlier document.
        self.page_findings = {}

        doc = fitz.open(pdf_path)
        try:
            total_pages = len(doc)
            self.total_pages = total_pages

            self.logger.info(f"Analyzing PDF: {pdf_path}")
            self.logger.info(f"Total pages: {total_pages}")

            # Parallel processing for speed
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                futures = [
                    (page_num, executor.submit(self._analyze_single_page, doc, page_num))
                    for page_num in range(total_pages)
                ]

                # Collect results
                for page_num, future in futures:
                    try:
                        result = future.result(timeout=self.PAGE_TIMEOUT_SECONDS)
                    except Exception as e:
                        self.logger.error(f"Failed to analyze page {page_num + 1}: {e}")
                        continue
                    if result:
                        self.page_findings[page_num] = result
        finally:
            # Runs after the executor has shut down, so no worker still
            # holds the document.
            doc.close()

        # Clean up GPU memory
        if self.gpu_optimization:
            torch.cuda.empty_cache()

        self.logger.info(f"Analysis complete. Found page numbers on {len(self.page_findings)} pages")
        return self._create_page_mapping()

    def _analyze_single_page(self, doc, page_num):
        """Run all OCR strategies on one page.

        Returns:
            dict with ``found_numbers``, ``detection_methods`` and
            ``primary_candidates``, or ``None`` when nothing plausible was
            found or the analysis failed.
        """
        with self._fitz_lock:
            page = doc[page_num]
            # Get page dimensions for region-based analysis
            rect = page.rect
            page_width, page_height = rect.width, rect.height

        all_found_numbers = set()
        detection_methods = {}

        try:
            # Method 1: Standard full-page OCR
            full_page_numbers = self._standard_ocr(page)
            all_found_numbers.update(full_page_numbers)
            if full_page_numbers:
                detection_methods['full_page'] = full_page_numbers

            # Method 2: Region-based OCR (header/footer areas)
            region_numbers = self._region_based_ocr(page, page_width, page_height)
            all_found_numbers.update(region_numbers)
            if region_numbers:
                detection_methods['regions'] = region_numbers

            # Method 3: Multi-resolution OCR
            multi_res_numbers = self._multi_resolution_ocr(page)
            all_found_numbers.update(multi_res_numbers)
            if multi_res_numbers:
                detection_methods['multi_res'] = multi_res_numbers

            # Filter to reasonable page numbers
            valid_numbers = [
                n for n in all_found_numbers if 1 <= n <= self.MAX_PAGE_NUMBER
            ]

            if valid_numbers:
                return {
                    'found_numbers': sorted(valid_numbers),
                    'detection_methods': detection_methods,
                    'primary_candidates': self._identify_primary_candidates(valid_numbers, detection_methods)
                }

        except Exception as e:
            # Best effort: the page is skipped, but the reason stays visible.
            self.logger.warning(f"Error analyzing page {page_num + 1}: {e}")

        return None

    def _render_to_png(self, page, zoom, clip=None):
        """Render ``page`` (optionally clipped) to a temporary PNG file.

        Returns:
            ``(path, (width, height))`` of the rendered image.  The caller
            owns the file and must delete it.
        """
        # mkstemp creates the file atomically, unlike the race-prone mktemp.
        fd, img_path = tempfile.mkstemp(suffix='.png')
        os.close(fd)
        try:
            with self._fitz_lock:
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), clip=clip)
                size = (pix.width, pix.height)
                pix.save(img_path)
        except Exception:
            self._remove_quietly(img_path)
            raise
        return img_path, size

    @staticmethod
    def _enhance_in_place(img_path, contrast):
        """Overwrite the image with a contrast-boosted grayscale version."""
        with Image.open(img_path) as img:
            processed = ImageEnhance.Contrast(img).enhance(contrast).convert('L')
        # Saved back so that OCR reads the preprocessed pixels.
        processed.save(img_path)

    @staticmethod
    def _remove_quietly(img_path):
        """Delete a temporary file, ignoring the case where it is already gone."""
        try:
            os.remove(img_path)
        except OSError:
            pass

    def _standard_ocr(self, page):
        """OCR the whole page at 3x zoom after a mild contrast boost.

        Returns:
            list of detected page numbers; empty on any failure.
        """
        try:
            img_path, _ = self._render_to_png(page, 3)  # Optimized resolution
            try:
                # Preprocess image
                self._enhance_in_place(img_path, 1.3)

                # OCR
                results = self.reader.readtext(img_path, detail=True)
            finally:
                self._remove_quietly(img_path)
            return self._extract_numbers_from_ocr(results)

        except Exception as e:
            self.logger.debug(f"Standard OCR failed: {e}")
            return []

    def _region_based_ocr(self, page, page_width, page_height):
        """OCR specific regions where page numbers typically appear.

        Returns:
            list of detected page numbers across all regions.
        """
        numbers = set()

        # Define key regions
        regions = [
            fitz.Rect(0, 0, page_width, page_height * 0.08),  # Top header
            fitz.Rect(0, page_height * 0.92, page_width, page_height),  # Bottom footer
            fitz.Rect(page_width * 0.4, page_height * 0.9, page_width * 0.6, page_height),  # Center bottom
        ]

        for region in regions:
            try:
                img_path, (width, height) = self._render_to_png(page, 4, clip=region)
                try:
                    # Skip slivers too small to contain readable text.
                    if width <= 20 or height <= 10:
                        continue

                    # Enhanced preprocessing for small regions
                    self._enhance_in_place(img_path, 2.0)

                    results = self.reader.readtext(img_path, detail=True)
                    numbers.update(
                        self._extract_numbers_from_ocr(results, min_confidence=0.3)
                    )
                finally:
                    self._remove_quietly(img_path)

            except Exception as e:
                self.logger.debug(f"Region OCR failed for {region}: {e}")
                continue

        return list(numbers)

    def _multi_resolution_ocr(self, page):
        """OCR the whole page at several zoom levels.

        Returns:
            list of detected page numbers across all zoom levels.
        """
        numbers = set()
        resolutions = [2, 4, 6]  # Optimized resolution set

        for zoom in resolutions:
            try:
                img_path, _ = self._render_to_png(page, zoom)
                try:
                    results = self.reader.readtext(img_path, detail=True)
                finally:
                    self._remove_quietly(img_path)
                numbers.update(
                    self._extract_numbers_from_ocr(results, min_confidence=0.4)
                )

            except Exception as e:
                self.logger.debug(f"OCR at zoom {zoom} failed: {e}")
                continue

        return list(numbers)

    def _extract_numbers_from_ocr(self, ocr_results, min_confidence=0.5):
        """Extract page numbers from OCR results using multiple patterns.

        Args:
            ocr_results: Iterable of ``(bbox, text, confidence)`` tuples.
            min_confidence: Fragments below this confidence are ignored.

        Returns:
            list of distinct integers in ``1..MAX_PAGE_NUMBER`` (unordered).
        """
        numbers = set()

        for _bbox, text, confidence in ocr_results:
            if confidence < min_confidence:
                continue

            text_clean = text.strip()

            for pattern in self._NUMBER_PATTERNS:
                for match in pattern.findall(text_clean):
                    if match.isdigit():
                        num = int(match)
                        if 1 <= num <= self.MAX_PAGE_NUMBER:  # Reasonable page range
                            numbers.add(num)

        return list(numbers)

    def _identify_primary_candidates(self, numbers, methods):
        """Identify the most likely page number candidates.

        Args:
            numbers: All valid numbers found on the page (not used by the
                ranking itself).
            methods: dict of detection method name -> list of numbers.

        Returns:
            Up to three numbers that were reported at least twice across the
            detection methods, most frequent first.
        """
        # Count detection frequency across methods
        frequency = Counter()
        for method_numbers in methods.values():
            frequency.update(method_numbers)

        primary = [num for num, count in frequency.most_common() if count >= 2]
        return primary[:3]  # Top 3 candidates

    def _create_page_mapping(self):
        """Create the page number -> PDF position mapping.

        Page numbers are processed in ascending order and each PDF position
        is assigned to at most one page number.

        Returns:
            dict mapping page number -> 0-based PDF position.
        """
        self.logger.info("Creating page mapping with disambiguation...")

        # Collect all page number detections
        detections = defaultdict(list)
        for pdf_pos, data in self.page_findings.items():
            for page_num in data['primary_candidates']:
                detections[page_num].append(pdf_pos)

        # Resolve conflicts using heuristics
        final_mapping = {}
        used_positions = set()

        # Sort page numbers for sequential processing
        for page_num in sorted(detections):
            # Filter out already used positions
            available_positions = [
                p for p in detections[page_num] if p not in used_positions
            ]
            if not available_positions:
                continue

            # Choose best position using heuristics
            best_pos = self._choose_best_position(page_num, available_positions)
            final_mapping[page_num] = best_pos
            used_positions.add(best_pos)

        self.logger.info(f"Final mapping created: {len(final_mapping)} pages mapped")

        # Log mapping for verification
        for page_num in sorted(final_mapping):
            self.logger.info(f"Page {page_num} -> PDF position {final_mapping[page_num] + 1}")

        return final_mapping

    def _choose_best_position(self, page_num, positions):
        """Choose the best PDF position for a page number using heuristics.

        Args:
            page_num: The detected page number being placed.
            positions: Candidate 0-based PDF positions (non-empty).

        Returns:
            The highest-scoring position; the first one wins ties.
        """
        if len(positions) == 1:
            return positions[0]

        position_scores = {}

        for pos in positions:
            findings = self.page_findings[pos]
            score = 0

            # Prefer positions with fewer total detections
            score += 1.0 / (len(findings['found_numbers']) + 1)

            # Prefer positions where this page number is a primary candidate
            if page_num in findings['primary_candidates']:
                score += 1.0

            # Prefer positions that are reasonable for the page number.
            # NOTE(review): the window compares a 1-based page number with a
            # 0-based position and is capped by the number of pages that have
            # findings - confirm this is the intended heuristic.
            expected_range_start = max(0, page_num - 3)
            expected_range_end = min(len(self.page_findings), page_num + 3)
            if expected_range_start <= pos <= expected_range_end:
                score += 0.5

            position_scores[pos] = score

        return max(positions, key=lambda p: position_scores.get(p, 0))

    def create_reordered_pdf(self, input_path, output_path, page_mapping):
        """Create reordered PDF based on page mapping.

        Only pages present in ``page_mapping`` are written; positions that
        were not mapped are reported in a warning.

        Args:
            input_path: Source PDF.
            output_path: Destination for the reordered PDF.
            page_mapping: dict of page number -> 0-based PDF position.

        Returns:
            Number of pages written.

        Raises:
            ValueError: If ``page_mapping`` is empty.
        """
        if not page_mapping:
            raise ValueError("No page mapping available for reordering")

        self.logger.info(f"Creating reordered PDF: {output_path}")

        # Sort by page numbers and reorder
        sorted_pages = sorted(page_mapping.items())

        doc = fitz.open(input_path)
        try:
            reordered_doc = fitz.open()
            try:
                for page_num, pdf_position in sorted_pages:
                    reordered_doc.insert_pdf(doc, from_page=pdf_position, to_page=pdf_position)
                    self.logger.debug(f"Added page {page_num} from PDF position {pdf_position + 1}")

                omitted = sorted(set(range(len(doc))) - set(page_mapping.values()))
                if omitted:
                    self.logger.warning(
                        f"{len(omitted)} page(s) without a mapped page number were left out "
                        f"of the output (PDF positions: {[p + 1 for p in omitted]})"
                    )

                # Save reordered PDF
                reordered_doc.save(output_path)
            finally:
                reordered_doc.close()
        finally:
            doc.close()

        self.logger.info(f"Reordered PDF saved: {output_path}")
        self.logger.info(f"Pages reordered: {len(sorted_pages)}")

        return len(sorted_pages)

    def generate_report(self, output_path, page_mapping):
        """Write a JSON report with a summary, the mapping and per-page findings.

        Args:
            output_path: Destination of the JSON file.
            page_mapping: dict of page number -> 0-based PDF position.
        """
        findings = getattr(self, 'page_findings', {})
        # page_findings only holds pages where numbers were found, so the
        # real page count comes from the last analysis when available.
        total_pages = getattr(self, 'total_pages', 0) or len(findings)

        if page_mapping:
            page_range = f"{min(page_mapping.keys())}-{max(page_mapping.keys())}"
        else:
            page_range = "None"

        report = {
            'summary': {
                'total_pdf_pages': total_pages,
                'pages_with_numbers': len([p for p in findings.values() if p['found_numbers']]),
                'successful_mappings': len(page_mapping),
                'page_range': page_range
            },
            'page_mappings': page_mapping,
            'detailed_findings': findings
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)

        self.logger.info(f"Analysis report saved: {output_path}")

    def process_pdf(self, input_path, output_dir=None):
        """Complete PDF processing pipeline: analyze, reorder, report.

        Args:
            input_path: PDF to process.
            output_dir: Directory for the outputs; defaults to the input's
                directory and is created if missing.

        Returns:
            dict with ``success`` plus either the output paths and mapping,
            or an ``error`` message.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
        """
        input_path = Path(input_path)

        if not input_path.exists():
            raise FileNotFoundError(f"Input PDF not found: {input_path}")

        # Setup output paths
        if output_dir is None:
            output_dir = input_path.parent
        else:
            output_dir = Path(output_dir)
            # parents=True so a nested, not-yet-existing directory works too.
            output_dir.mkdir(parents=True, exist_ok=True)

        base_name = input_path.stem
        output_pdf = output_dir / f"{base_name}_reordered.pdf"
        report_path = output_dir / f"{base_name}_analysis_report.json"

        try:
            # Step 1: Analyze PDF
            self.logger.info("Step 1: Analyzing PDF structure...")
            page_mapping = self.analyze_pdf(input_path)

            if not page_mapping:
                raise ValueError("No page numbers could be detected and mapped")

            # Step 2: Create reordered PDF
            self.logger.info("Step 2: Creating reordered PDF...")
            pages_reordered = self.create_reordered_pdf(input_path, output_pdf, page_mapping)

            # Step 3: Generate report
            self.logger.info("Step 3: Generating analysis report...")
            self.generate_report(report_path, page_mapping)

            # Summary
            self.logger.info("Processing complete!")
            self.logger.info(f"Input: {input_path}")
            self.logger.info(f"Output: {output_pdf}")
            self.logger.info(f"Report: {report_path}")
            self.logger.info(f"Pages reordered: {pages_reordered}")

            return {
                'success': True,
                'input_path': str(input_path),
                'output_pdf': str(output_pdf),
                'report_path': str(report_path),
                'pages_reordered': pages_reordered,
                'page_mapping': page_mapping
            }

        except Exception as e:
            self.logger.error(f"Processing failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'input_path': str(input_path)
            }

def main():
    """Command-line entry point: parse arguments, reorder the PDF, print the outcome.

    Exits the process with status 1 when processing fails.
    """
    import sys  # local import: only needed for the failure exit status

    parser = argparse.ArgumentParser(description='Adaptive PDF Page Reorderer')
    parser.add_argument('input_pdf', help='Path to input PDF file')
    parser.add_argument('-o', '--output-dir', help='Output directory (default: same as input)')
    parser.add_argument('--no-gpu', action='store_true', help='Disable GPU acceleration')
    parser.add_argument('--workers', type=int, help='Number of worker threads')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose logging')

    args = parser.parse_args()

    # Initialize reorderer
    reorderer = AdaptivePDFReorderer(
        gpu_optimization=not args.no_gpu,
        max_workers=args.workers
    )

    # Apply -v only after the reorderer exists: its constructor calls
    # logging.basicConfig(level=INFO), which can reset a root level that was
    # set earlier and make the flag ineffective.
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Process PDF
    result = reorderer.process_pdf(args.input_pdf, args.output_dir)

    if result['success']:
        print("SUCCESS: PDF reordered successfully")
        print(f"Output: {result['output_pdf']}")
        print(f"Pages reordered: {result['pages_reordered']}")
    else:
        print(f"FAILED: {result['error']}")
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist in every interpreter setup.
        sys.exit(1)

if __name__ == "__main__":
    import sys

    # Jupyter/Colab kernels are launched with their own "-f <connection file>"
    # arguments, which argparse rejects ("unrecognized arguments: -f").
    # Treat a notebook kernel like a run without arguments and prompt instead.
    running_in_notebook = 'ipykernel' in sys.modules

    # For direct usage without command line
    if len(sys.argv) == 1 or running_in_notebook:
        # Example usage - replace with your PDF path
        input_pdf = input("Enter PDF path: ").strip().strip('"\'')

        reorderer = AdaptivePDFReorderer(gpu_optimization=True)
        result = reorderer.process_pdf(input_pdf)

        if result['success']:
            print("\nSUCCESS!")
            print(f"Reordered PDF: {result['output_pdf']}")
            print(f"Analysis report: {result['report_path']}")
        else:
            print(f"FAILED: {result['error']}")
    else:
        main()

usage: colab_kernel_launcher.py [-h] [-o OUTPUT_DIR] [--no-gpu]
                                [--workers WORKERS] [-v]
                                input_pdf
colab_kernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2