In [32]:
#2Import required libraries
import fitz  # PyMuPDF
import re
import os
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass

# Configuration
PDF_PATH = "Migration Legislation Annotations.pdf"
OUTPUT_DIR = "output"
TOC_PAGES = [11, 12]  # PDF pages containing TOC
PAGE_OFFSET = 44 - 2  # Document page 1 starts at PDF page 44 (0-indexed: 42)


# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("📚 Migration Legislation TOC Generator")
print("=" * 50)
print(f"PyMuPDF version: {fitz.version[0]}")
print(f"PDF file: {PDF_PATH}")
print(f"TOC pages: {TOC_PAGES}")
print(f"Page offset: {PAGE_OFFSET + 1} (document page 1 = PDF page {PAGE_OFFSET + 1})")
print("=" * 50)


📚 Migration Legislation TOC Generator
PyMuPDF version: 1.26.3
PDF file: Migration Legislation Annotations.pdf
TOC pages: [11, 12]
Page offset: 43 (document page 1 = PDF page 43)


In [33]:
# Analyze PDF structur
print("🔍 Step 1: PDF Structure Analysis")
print("-" * 30)

# Open PDF and get basic information
doc = fitz.open(PDF_PATH)

pdf_info = {
    'filename': PDF_PATH,
    'page_count': len(doc),
    'title': doc.metadata.get('title', 'Unknown'),
    'author': doc.metadata.get('author', 'Unknown'),
    'creator': doc.metadata.get('creator', 'Unknown'),
    'file_size_mb': round(os.path.getsize(PDF_PATH) / (1024*1024), 2)
}

print("📄 PDF Information:")
for key, value in pdf_info.items():
    print(f"   {key}: {value}")

# Verify TOC pages exist
print(f"\n🔍 Verifying TOC pages {TOC_PAGES}:")
for page_num in TOC_PAGES:
    if page_num <= len(doc):
        page = doc[page_num - 1]  # Convert to 0-indexed
        text_length = len(page.get_text())
        print(f"   Page {page_num}: ✅ Exists ({text_length} characters)")
    else:
        print(f"   Page {page_num}: ❌ Does not exist!")

# Verify page offset assumption
print(f"\n🔍 Verifying page offset (document page 1 = PDF page {PAGE_OFFSET + 1}):")
if PAGE_OFFSET + 1 <= len(doc):
    offset_page = doc[PAGE_OFFSET]  # PAGE_OFFSET is already 0-indexed
    text_sample = offset_page.get_text()
    print(f"   PDF page {PAGE_OFFSET + 1} content sample:")
    print(f"   '{text_sample}...'")
    
    # Look for indicators that this is page 1 of the document
    if any(indicator in text_sample.lower() for indicator in ['Act']):
        print("   ✅ Likely contains document page 1 content")
    else:
        print("   ⚠️  May not be document page 1 - please verify")
else:
    print(f"   ❌ PDF page {PAGE_OFFSET + 1} does not exist!")

doc.close()
print("\n✅ Step 1 Complete: PDF structure verified")


🔍 Step 1: PDF Structure Analysis
------------------------------
📄 PDF Information:
   filename: Migration Legislation Annotations.pdf
   page_count: 2120
   title: Migration Legislation with Annotations
   author: Natasha Bosnjak; Ben Petrie; 
   creator: XyEnterprise XPP 8.4C.1
   file_size_mb: 7.36

🔍 Verifying TOC pages [11, 12]:
   Page 11: ✅ Exists (5493 characters)
   Page 12: ✅ Exists (3496 characters)

🔍 Verifying page offset (document page 1 = PDF page 43):
   PDF page 43 content sample:
   'Migration Act 1958
Annotated Migration Act with Related
Commentary
...'
   ⚠️  May not be document page 1 - please verify

✅ Step 1 Complete: PDF structure verified


In [34]:
# Extract TOC pages content
print("🔍 Step 2: TOC Pages Inspection")
print("-" * 30)

doc = fitz.open(PDF_PATH)

# Extract text from each TOC page
toc_raw_text = ""
for page_num in TOC_PAGES:
    print(f"\n📄 Extracting content from page {page_num}:")
    page = doc[page_num - 1]  # Convert to 0-indexed
    page_text = page.get_text()
    toc_raw_text += page_text + "\n"
    
    lines = page_text.split('\n')
    non_empty_lines = [line.strip() for line in lines if line.strip()]
    
    print(f"   Total lines: {len(lines)}")
    print(f"   Non-empty lines: {len(non_empty_lines)}")
    print(f"   Characters: {len(page_text)}")
    
    # Show first few lines as sample
    print(f"   Sample lines:")
    for i, line in enumerate(non_empty_lines[:5]):
        print(f"     {i+1}. {line}")

# Save raw TOC text for inspection
raw_toc_file = f"{OUTPUT_DIR}/raw_toc_text.txt"
with open(raw_toc_file, 'w', encoding='utf-8') as f:
    f.write(toc_raw_text)

print(f"\n💾 Raw TOC text saved to: {raw_toc_file}")

# Analyze patterns in the TOC
print(f"\n🔍 Pattern Analysis:")
lines = toc_raw_text.split('\n')
lines = [line.strip() for line in lines if line.strip()]

# Look for lines with dots and page numbers (TOC pattern) - including roman numerals
toc_pattern = re.compile(r'^(.+?)\.{3,}\s*([ivxlcdm]+|\d+)$', re.IGNORECASE)
potential_toc_lines = []

def roman_to_int(roman):
    """Convert roman numeral to integer"""
    if not roman:
        return 0
    
    roman = roman.upper()
    values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    prev_value = 0
    
    for char in reversed(roman):
        value = values.get(char, 0)
        if value < prev_value:
            total -= value
        else:
            total += value
        prev_value = value
    
    return total

def is_roman_numeral(s):
    """Check if string is a roman numeral"""
    return bool(re.match(r'^[ivxlcdm]+$', s, re.IGNORECASE))

def process_multiline_toc_entries(lines):
    """Process TOC entries that may span multiple lines"""
    processed_entries = []
    i = 0
    
    def is_valid_toc_start(line):
        """Check if a line could be the start of a valid TOC entry"""
        line = line.strip()
        # Skip copyright notices, page headers, and other non-TOC content
        skip_patterns = [
            r'^©.*THOMSON REUTERS',  # Copyright lines
            r'^\d+$',                # Standalone page numbers
            r'^[ivxlcdm]+$',         # Standalone roman numerals
            r'^Migration.*Annotations$',  # Document headers
            r'^CONTENTS$',           # Standalone "CONTENTS" (we handle this manually)
            r'^\s*$'                 # Empty lines
        ]
        
        for pattern in skip_patterns:
            if re.match(pattern, line, re.IGNORECASE):
                return False
        
        # Must contain actual content (letters/words)
        if not re.search(r'[a-zA-Z]', line):
            return False
            
        return True
    
    while i < len(lines):
        line = lines[i]
        match = toc_pattern.match(line)
        
        if match:
            # Found a complete TOC entry on single line
            title = match.group(1).strip()
            page_str = match.group(2).strip()
            
            # Convert page number (roman or arabic)
            if is_roman_numeral(page_str):
                page_num = roman_to_int(page_str)
                page_type = 'roman'
            else:
                page_num = int(page_str)
                page_type = 'arabic'
            
            processed_entries.append((title, page_num, line, page_type))
            i += 1
        else:
            # Check if this might be the start of a multiline entry
            potential_title = line.strip()
            
            # Skip lines that can't be valid TOC starts
            if not is_valid_toc_start(potential_title):
                i += 1
                continue
                
            found_multiline = False
            
            # Look ahead up to 3 lines for the page number
            for j in range(i + 1, min(i + 4, len(lines))):
                next_line = lines[j].strip()
                
                # Check if this line ends with dots and page number
                multiline_match = toc_pattern.match(next_line)
                if multiline_match:
                    # Found the continuation with page number
                    continuation_title = multiline_match.group(1).strip()
                    page_str = multiline_match.group(2).strip()
                    
                    # Use only the continuation title if it exists, otherwise use potential_title
                    if continuation_title:
                        # If continuation has substantial content, use it as the main title
                        if len(continuation_title) > len(potential_title) * 0.5:
                            combined_title = continuation_title
                        else:
                            combined_title = f"{potential_title} {continuation_title}"
                    else:
                        combined_title = potential_title
                    
                    # Convert page number
                    if is_roman_numeral(page_str):
                        page_num = roman_to_int(page_str)
                        page_type = 'roman'
                    else:
                        page_num = int(page_str)
                        page_type = 'arabic'
                    
                    # Create combined line for reference
                    combined_line = f"{potential_title} {next_line}"
                    
                    processed_entries.append((combined_title, page_num, combined_line, page_type))
                    found_multiline = True
                    i = j + 1  # Skip to after the found continuation
                    break
                elif next_line and is_valid_toc_start(next_line) and not next_line.startswith(('Part', 'Schedule', 'Division', 'Section')):
                    # This might be a continuation line, add it to potential title
                    potential_title += f" {next_line}"
                else:
                    # Hit another potential TOC entry or invalid line, stop looking
                    break
            
            if not found_multiline:
                # No multiline match found, skip this line
                i += 1
    
    return processed_entries

# Process entries with multiline support
potential_toc_lines = process_multiline_toc_entries(lines)

# Add manual "Contents" entry if not found (roman page xi = 11)
contents_found = any(entry[0].lower() == 'contents' for entry in potential_toc_lines)
if not contents_found:
    print("   📝 Adding manual 'Contents' entry (page xi)")
    potential_toc_lines.insert(0, ("Contents", 11, "Contents (manual entry)", "roman"))

print(f"   Lines matching TOC pattern: {len(potential_toc_lines)}")
print(f"   Total lines in TOC pages: {len(lines)}")

# Analyze multiline entries
multiline_entries = [entry for entry in potential_toc_lines if '\n' in entry[2] or 'manual entry' in entry[2]]
print(f"   Multiline/Manual entries detected: {len(multiline_entries)}")

# Show sample matches
print(f"\n📝 Sample TOC pattern matches:")
for i, (title, page_num, full_line, page_type) in enumerate(potential_toc_lines[:10]):
    multiline_indicator = " (multiline)" if '\n' in full_line or 'manual' in full_line else ""
    print(f"   {i+1}. Title: '{title}' -> Page: {page_num} ({page_type}){multiline_indicator}")
    print(f"      Full line: '{full_line[:100]}{'...' if len(full_line) > 100 else ''}'")

doc.close()
print(f"\n✅ Step 2 Complete: Found {len(potential_toc_lines)} potential TOC entries")


🔍 Step 2: TOC Pages Inspection
------------------------------

📄 Extracting content from page 11:
   Total lines: 45
   Non-empty lines: 44
   Characters: 5493
   Sample lines:
     1. CONTENTS
     2. Foreword ...................................................................................................................................................... v
     3. Preface ........................................................................................................................................................ vii
     4. About this publication ............................................................................................................................... xiii
     5. About the authors ...................................................................................................................................... xix

📄 Extracting content from page 12:
   Total lines: 31
   Non-empty lines: 30
   Characters: 3496
   Sample lines:
     1. Schedule 4 

In [40]:
# Define TOC Entry data structure
@dataclass
class TOCEntry:
    title: str
    document_page: int  # Page number as shown in document
    pdf_page: int       # Actual PDF page (with offset applied)
    level: int          # Hierarchy level (1=Part, 2=Division, 3=Section)
    page_type: str      # 'roman' or 'arabic'

print("🔍 Step 3: TOC Entry Processing & Validation")
print("-" * 40)

# Process TOC entries
toc_entries = []
for title, doc_page, full_line, page_type in potential_toc_lines:
    # Apply page offset based on page type
    if page_type == 'roman':
        # Roman numerals start at PDF page 1 (no offset needed)
        pdf_page = doc_page
    else:
        # Arabic numerals: document page 1 starts at PDF page 43
        pdf_page = doc_page + PAGE_OFFSET
    
    # Create TOC entry
    entry = TOCEntry(
        title=title,
        document_page=doc_page,
        pdf_page=pdf_page,
        level=0,
        page_type=page_type,
    )
    toc_entries.append(entry)

print(f"📊 Processing Results:")
print(f"   Total entries processed: {len(toc_entries)}")

# Analyze entry types
type_counts = {}
level_counts = {}
for entry in toc_entries:
    level_counts[entry.level] = level_counts.get(entry.level, 0) + 1

print(f"   Hierarchy levels: {dict(sorted(level_counts.items()))}")

# Show sample processed entries
print(f"\n📝 Sample processed entries:")
for i, entry in enumerate(toc_entries[:10]):
    print(f"   {i+1}. {entry.title}")
    print(f"      Doc page: {entry.document_page} ({entry.page_type}) -> PDF page: {entry.pdf_page}")
    print(f"      Level: {entry.level}")

# Save TOC entries to JSON file for use in Step 5
import json

toc_data = {
    "metadata": {
        "total_entries": len(toc_entries),
        "extraction_date": "2024-01-01",  # You can use datetime.now().isoformat() for actual date
        "source_pages": TOC_PAGES,
        "page_offset": PAGE_OFFSET
    },
    "entries": []
}

# Convert TOC entries to dictionary format for JSON serialization
for entry in toc_entries:
    entry_dict = {
        "title": entry.title,
        "document_page": entry.document_page,
        "pdf_page": entry.pdf_page,
        "level": entry.level,
        "page_type": entry.page_type
    }
    toc_data["entries"].append(entry_dict)

# Save to JSON file
toc_json_file = f"{OUTPUT_DIR}/toc_structure.json"
with open(toc_json_file, 'w', encoding='utf-8') as f:
    json.dump(toc_data, f, indent=2, ensure_ascii=False)

print(f"\n💾 TOC structure saved to: {toc_json_file}")
print(f"   📊 Contains {len(toc_entries)} entries with metadata")
print(f"   📄 JSON file size: {round(os.path.getsize(toc_json_file) / 1024, 1)} KB")

print(f"\n✅ Step 3 Complete: {len(toc_entries)} TOC entries processed and saved to JSON")


🔍 Step 3: TOC Entry Processing & Validation
----------------------------------------
📊 Processing Results:
   Total entries processed: 65
   Hierarchy levels: {0: 65}

📝 Sample processed entries:
   1. Contents
      Doc page: 11 (roman) -> PDF page: 11
      Level: 0
   2. Foreword
      Doc page: 5 (roman) -> PDF page: 5
      Level: 0
   3. Preface
      Doc page: 7 (roman) -> PDF page: 7
      Level: 0
   4. About this publication
      Doc page: 13 (roman) -> PDF page: 13
      Level: 0
   5. About the authors
      Doc page: 19 (roman) -> PDF page: 19
      Level: 0
   6. Table of Cases
      Doc page: 21 (roman) -> PDF page: 21
      Level: 0
   7. MIGRATION ACT 1958
      Doc page: 1 (arabic) -> PDF page: 43
      Level: 0
   8. Table of Provisions
      Doc page: 3 (arabic) -> PDF page: 45
      Level: 0
   9. Table of Amending Legislation
      Doc page: 27 (arabic) -> PDF page: 69
      Level: 0
   10. Table of Annotations
      Doc page: 51 (arabic) -> PDF page: 93
      Le

In [41]:
# Validate TOC entries against actual content
print("🔍 Step 4: Content Validation")
print("-" * 30)

doc = fitz.open(PDF_PATH)

def validate_toc_entry(entry: TOCEntry, doc: fitz.Document) -> Dict[str, any]:
    """Validate a single TOC entry against actual page content"""
    result = {
        'entry': entry,
        'valid': False,
        'page_exists': False,
        'content_match': False,
        'page_content': '',
        'match_score': 0.0
    }
    
    # Check if PDF page exists
    if 0 <= entry.pdf_page - 1 < len(doc):  # Convert to 0-indexed
        result['page_exists'] = True
        page = doc[entry.pdf_page - 1]
        page_content = page.get_text()
        result['page_content'] = page_content[:300]  # First 300 chars
        
        # Simple content matching
        title_words = entry.title.lower().split()
        content_lower = page_content.lower()
        
        # Count how many title words appear in the page content
        matches = sum(1 for word in title_words if word in content_lower)
        if len(title_words) > 0:
            result['match_score'] = matches / len(title_words)
            result['content_match'] = result['match_score'] > 0.3  # 30% threshold
        
        result['valid'] = result['page_exists'] and result['content_match']
    
    return result

# Validate sample entries (first 10 for quick verification)
print("📊 Validating sample TOC entries...")
validation_results = []
sample_entries = toc_entries[:10]

for i, entry in enumerate(sample_entries):
    result = validate_toc_entry(entry, doc)
    validation_results.append(result)
    
    status = "✅" if result['valid'] else "❌"
    print(f"   {i+1}. {status} {entry.title}")
    print(f"      Page {entry.document_page} -> PDF page {entry.pdf_page}")
    print(f"      Match score: {result['match_score']:.2f}")
    if result['page_exists']:
        print(f"      Content preview: '{result['page_content'][:100]}...'")
    else:
        print(f"      ❌ Page does not exist!")

# Summary statistics
valid_count = sum(1 for r in validation_results if r['valid'])
page_exists_count = sum(1 for r in validation_results if r['page_exists'])

print(f"\n📊 Validation Summary (sample of {len(sample_entries)}):")
print(f"   Pages exist: {page_exists_count}/{len(sample_entries)} ({page_exists_count/len(sample_entries)*100:.1f}%)")
print(f"   Content matches: {valid_count}/{len(sample_entries)} ({valid_count/len(sample_entries)*100:.1f}%)")

# Check page range validity
min_pdf_page = min(entry.pdf_page for entry in toc_entries)
max_pdf_page = max(entry.pdf_page for entry in toc_entries)
pdf_page_count = len(doc)

print(f"\n📊 Page Range Analysis:")
print(f"   TOC page range: {min_pdf_page} to {max_pdf_page}")
print(f"   PDF total pages: {pdf_page_count}")
print(f"   Range validity: {'✅ Valid' if max_pdf_page <= pdf_page_count else '❌ Invalid'}")

doc.close()
print(f"\n✅ Step 4 Complete: Validation results available for analysis")


🔍 Step 4: Content Validation
------------------------------
📊 Validating sample TOC entries...
   1. ✅ Contents
      Page 11 -> PDF page 11
      Match score: 1.00
      Content preview: 'CONTENTS
Foreword .....................................................................................'
   2. ✅ Foreword
      Page 5 -> PDF page 5
      Match score: 1.00
      Content preview: 'FOREWORD
In Australia migration law has a long and complex history reﬂected in a vast array of visa
...'
   3. ✅ Preface
      Page 7 -> PDF page 7
      Match score: 1.00
      Content preview: 'PREFACE
The question of how to treat foreigners has long occupied a central place in Australia’s nat...'
   4. ✅ About this publication
      Page 13 -> PDF page 13
      Match score: 1.00
      Content preview: 'ABOUT THIS PUBLICATION
SCOPE OF THIS WORK
Migration Law – Annotated Migration Act with Related Legis...'
   5. ✅ About the authors
      Page 19 -> PDF page 19
      Match score: 1.00
      Content preview

In [42]:
# Generate TOC text for verification
print("🔍 Step 5a: TOC Text Generation")
print("-" * 35)

# Load TOC structure from JSON file created in Step 3
import json

toc_json_file = f"{OUTPUT_DIR}/toc_structure.json"
print(f"📁 Loading TOC structure from: {toc_json_file}")

try:
    with open(toc_json_file, 'r', encoding='utf-8') as f:
        toc_data = json.load(f)
    
    print(f"   ✅ Loaded {toc_data['metadata']['total_entries']} entries")
    print(f"   📄 Source pages: {toc_data['metadata']['source_pages']}")
    print(f"   ⚙️  Page offset: {toc_data['metadata']['page_offset']}")
    
    # Recreate TOC entries from JSON data
    toc_entries_from_json = []
    for entry_dict in toc_data['entries']:
        entry = TOCEntry(
            title=entry_dict['title'],
            document_page=entry_dict['document_page'],
            pdf_page=entry_dict['pdf_page'],
            level=entry_dict['level'],
            page_type=entry_dict['page_type']
        )
        toc_entries_from_json.append(entry)
    
    # Use the loaded entries for TOC generation
    toc_entries = toc_entries_from_json
    
except FileNotFoundError:
    print(f"   ❌ JSON file not found! Please run Step 3 first.")
    print(f"   🔄 Falling back to in-memory toc_entries if available...")
    if 'toc_entries' not in locals():
        raise Exception("No TOC entries available. Please run Step 3 first.")

def generate_toc_text(entries: List[TOCEntry]) -> str:
    """Generate formatted TOC text (flat structure)"""
    lines = ["TABLE OF CONTENTS", "=" * 50, ""]
    
    for entry in entries:
        # No indentation - flat structure
        title = entry.title
        
        # Format the line with dots
        dots_needed = max(3, 70 - len(title) - len(str(entry.document_page)))
        dots = "." * dots_needed
        
        formatted_line = f"{title}{dots} {entry.document_page}"
        lines.append(formatted_line)
    
    return "\n".join(lines)

# Generate and save TOC text file
print("📝 Generating TOC text file...")
toc_text = generate_toc_text(toc_entries)
toc_text_file = f"{OUTPUT_DIR}/generated_toc.txt"

with open(toc_text_file, 'w', encoding='utf-8') as f:
    f.write(toc_text)

print(f"   ✅ TOC text saved to: {toc_text_file}")
print(f"   📊 TOC contains {len(toc_entries)} entries")
print(f"   📄 Text length: {len(toc_text)} characters")

# Preview the generated TOC
print(f"\n📝 TOC Preview (first 10 lines):")
toc_lines = toc_text.split('\n')
for i, line in enumerate(toc_lines[:13]):  # Show header + 10 entries
    print(f"   {line}")
if len(toc_lines) > 13:
    print(f"   ... ({len(toc_lines) - 13} more lines)")

print(f"\n✅ Step 5a Complete: TOC text generated and ready for verification")
print(f"   📁 Review the TOC file: {toc_text_file}")
print(f"   ▶️  If TOC looks correct, proceed to Step 5b to create the enhanced PDF")


🔍 Step 5a: TOC Text Generation
-----------------------------------
📁 Loading TOC structure from: output/toc_structure.json
   ✅ Loaded 65 entries
   📄 Source pages: [11, 12]
   ⚙️  Page offset: 42
📝 Generating TOC text file...
   ✅ TOC text saved to: output/generated_toc.txt
   📊 TOC contains 65 entries
   📄 Text length: 5047 characters

📝 TOC Preview (first 10 lines):
   TABLE OF CONTENTS
   
   Contents............................................................ 11
   Foreword............................................................. 5
   Preface.............................................................. 7
   About this publication.............................................. 13
   About the authors................................................... 19
   Table of Cases...................................................... 21
   MIGRATION ACT 1958................................................... 1
   Table of Provisions.................................................. 3
   

In [43]:
# Apply TOC to PDF - Create enhanced PDF with bookmarks
print("🔍 Step 5b: Apply TOC to PDF")
print("-" * 30)

def create_enhanced_pdf(original_path: str, toc_text: str, toc_entries: List[TOCEntry], output_path: str):
    """Create enhanced PDF with TOC page and bookmarks"""
    print("📄 Creating enhanced PDF...")
    
    # Open original PDF
    original_doc = fitz.open(original_path)
    
    # Create new document
    enhanced_doc = fitz.open()
    
    # Create TOC page using pre-generated text
    toc_page = enhanced_doc.new_page(width=595, height=842)  # A4 size
    
    # Insert TOC text
    text_rect = fitz.Rect(50, 50, 545, 792)  # Margins
    toc_page.insert_textbox(text_rect, toc_text, fontsize=10, align=0)
    
    print(f"   ✅ TOC page created with {len(toc_entries)} entries")
    
    # Copy all pages from original document
    enhanced_doc.insert_pdf(original_doc)
    
    print(f"   ✅ Copied {len(original_doc)} pages from original")
    
    # Add bookmarks using the TOC entries
    print(f"   📑 Adding bookmarks...")
    toc_outline = []
    for entry in toc_entries:
        # Create bookmark entry: (level, title, page_number)
        toc_outline.append((1, entry.title, entry.pdf_page + 1))  # +1 because we added TOC page
    
    try:
        enhanced_doc.set_toc(toc_outline)
        print(f"   ✅ Created {len(toc_outline)} bookmarks")
    except Exception as e:
        print(f"   ⚠️  Bookmark creation failed: {str(e)}")
    
    # Save enhanced PDF
    enhanced_doc.save(output_path)
    enhanced_doc.close()
    original_doc.close()
    
    return output_path

# Generate the enhanced PDF using the pre-generated TOC text
output_pdf_path = f"{OUTPUT_DIR}/Migration_Legislation_with_TOC.pdf"
enhanced_pdf = create_enhanced_pdf(PDF_PATH, toc_text, toc_entries, output_pdf_path)

# Verify the output
if os.path.exists(enhanced_pdf):
    file_size_mb = round(os.path.getsize(enhanced_pdf) / (1024*1024), 2)
    original_size_mb = round(os.path.getsize(PDF_PATH) / (1024*1024), 2)
    
    print(f"\n📊 Enhanced PDF Results:")
    print(f"   Output file: {enhanced_pdf}")
    print(f"   Original size: {original_size_mb} MB")
    print(f"   Enhanced size: {file_size_mb} MB")
    print(f"   Size change: {file_size_mb - original_size_mb:+.2f} MB")
    
    # Quick verification
    test_doc = fitz.open(enhanced_pdf)
    print(f"   Total pages: {len(test_doc)} (original: {len(fitz.open(PDF_PATH))})")
    print(f"   TOC entries: {len(toc_entries)}")
    test_doc.close()
    
    print(f"\n✅ Enhanced PDF created successfully!")
    print(f"   📁 Enhanced PDF location: {enhanced_pdf}")
    print(f"   🔖 {len(toc_entries)} bookmarks added for navigation")
else:
    print(f"\n❌ Failed to create enhanced PDF")

print(f"\n✅ Step 5b Complete: Enhanced PDF generation finished")


🔍 Step 5b: Apply TOC to PDF
------------------------------
📄 Creating enhanced PDF...
   ✅ TOC page created with 65 entries
   ✅ Copied 2120 pages from original
   📑 Adding bookmarks...
   ✅ Created 65 bookmarks

📊 Enhanced PDF Results:
   Output file: output/Migration_Legislation_with_TOC.pdf
   Original size: 7.36 MB
   Enhanced size: 7.37 MB
   Size change: +0.01 MB
   Total pages: 2121 (original: 2120)
   TOC entries: 65

✅ Enhanced PDF created successfully!
   📁 Enhanced PDF location: output/Migration_Legislation_with_TOC.pdf
   🔖 65 bookmarks added for navigation

✅ Step 5b Complete: Enhanced PDF generation finished
