In [12]:
from tracemalloc import start
from test_ocr.clean_ocr_refactored import clean_ocr_auto
from langchain_community.document_loaders import PDFMinerLoader
import random
import os
import time

# Configuration
# PDF files to load, clean and sample; paths are relative to the working directory.
docs = (
    "./brantome.pdf",
    "./medici.pdf",
    "./brotton.pdf",
    "./vankley.pdf",
    "./huguenots.pdf",
    "./henriiv.pdf",
)
# Length, in characters, of each text sample sliced out of a cleaned document.
test_len = 1000
# Number of samples taken from each document.
total_samples = 10

# For every configured PDF: load it, run the automatic OCR cleaner, then write
# `total_samples` random excerpts of `test_len` characters to "<name>_samples.txt"
# so the cleaning quality can be inspected by eye.
for doc in docs:
    print(f"Processing document: {doc}")
    try:
        print(f"Loading document: {doc}")
        start_time = time.time()
        loader = PDFMinerLoader(doc)
        pdf = loader.load()
        # Only the first returned Document is used.
        # NOTE(review): assumes the loader concatenates all pages into one Document - confirm.
        text = pdf[0].page_content
        loaded_time = time.time() - start_time
        print(f"Document loaded successfully: {doc} in {loaded_time:.2f} seconds")
        start_clean_time = time.time()
        text = clean_ocr_auto(text, verbose=True)
        clean_time = time.time() - start_clean_time
        print(f"Document cleaned successfully in {clean_time:.2f} seconds")
        print(f"Total processing time: {loaded_time + clean_time:.2f} seconds")

        # Split the text into `total_samples` equal blocks and draw one sample
        # from each, so the excerpts are spread across the whole document.
        total_chars = len(text)
        block_size = total_chars // total_samples
        samples = []
        for i in range(total_samples):
            block_start = i * block_size
            # Latest start that still keeps the sample inside its block. When a
            # block is shorter than `test_len`, fall back to the block start
            # (slicing past the end of the text is safe and just yields less).
            latest_start = max(block_start, block_start + block_size - test_len)
            sample_start = random.randint(block_start, latest_start)
            samples.append(text[sample_start : sample_start + test_len])

        f_name = doc.replace(".pdf", "_samples.txt")
        # Explicit UTF-8: the texts contain accented characters that a
        # platform-default codec may not be able to encode.
        with open(f_name, "w", encoding="utf-8") as f:
            # A visible delimiter keeps the sample boundaries recognisable.
            f.write("\n\n--- sample break ---\n\n".join(samples))
        print(f"Document processed successfully: {doc}")
    except Exception as e:
        # Top-level boundary: report the failure and continue with the next document.
        print(f"Error processing {doc}: {e}")

Processing document: ./brantome.pdf
Loading document: ./brantome.pdf
Document loaded successfully: ./brantome.pdf in 27.63 seconds
🤖 AUTO pipeline: detecting document type and language
🔧 Removing OCR artifacts (1,189,331 chars)
   ✅ OCR artifacts: 1,189,331 → 1,187,908 chars (-0.1%)
Document loaded successfully: ./brantome.pdf in 27.63 seconds
🤖 AUTO pipeline: detecting document type and language
🔧 Removing OCR artifacts (1,189,331 chars)
   ✅ OCR artifacts: 1,189,331 → 1,187,908 chars (-0.1%)
   📄 Detected type: Book
📚 Removing book metadata (1,187,908 chars)
   ✅ Book metadata: 1,187,908 → 1,187,908 chars (+0.0%)
   📄 Detected type: Book
📚 Removing book metadata (1,187,908 chars)
   ✅ Book metadata: 1,187,908 → 1,187,908 chars (+0.0%)
   🌍 Detected language: French
🇫🇷 Repairing French concatenation (1,187,908 chars)
   🌍 Detected language: French
🇫🇷 Repairing French concatenation (1,187,908 chars)
   ✅ French concatenation: 1,187,908 → 1,189,589 chars (+0.1%)
📄 Fixing line breaks (1,

KeyboardInterrupt: 

📊 PERFORMANCE ANALYSIS
Document    | Size (chars) | Time (s) | ms/char | Reduction %
------------------------------------------------------------
Brantome   |   1,189,331 |    35.4 |    0.03 |        0.5
Medici     |   2,074,321 |    67.2 |    0.03 |        0.7
Brotton    |     464,644 |    13.1 |    0.03 |        0.4
Vankley    |      66,909 |   422.7 |    6.32 |       13.2

🔍 KEY FINDINGS:
1. Vankley takes 200x longer per character than other documents
2. Vankley has highest reduction (13.2%) indicating dense academic metadata
3. Academic papers trigger complex regex patterns causing performance issues

💡 OPTIMIZATION RECOMMENDATIONS:
1. Pre-filter academic papers to remove obvious patterns first
2. Use simpler regex for academic metadata removal
3. Process academic papers in chunks to avoid regex backtracking
4. Consider different cleaning strategies for dense vs sparse academic content

🎯 The issue: Academic papers like Vankley have:
- Dense citation patterns: [1], [2,3], (2020), e


🤔 SHOULD WE SKIP OCR CLEANING FOR ACADEMIC PAPERS?
📊 COST-BENEFIT COMPARISON:

ACADEMIC PAPERS (Vankley example):
  ⏱️  Processing time: 422.7 seconds
  📄 Characters removed: 8,831
  🎯 Reduction achieved: 13.2%
  💸 Cost: 0.05 seconds per character cleaned

BOOKS (average):
  ⏱️  Processing time: 38.6 seconds
  📄 Characters removed: 6,628
  🎯 Reduction achieved: 0.5%
  💸 Cost: 0.0058 seconds per character cleaned

🔥 ACADEMIC PAPERS ARE 8X MORE EXPENSIVE TO CLEAN!

💡 RECOMMENDATIONS:

✅ CLEAN ACADEMIC PAPERS IF:
  - You have dense JSTOR/database downloads (like Vankley)
  - Papers have heavy citation formatting
  - 10-15% text reduction is worth the processing time
  - You're processing small academic collections

❌ SKIP CLEANING FOR ACADEMIC PAPERS IF:
  - Processing 23GB+ libraries where speed matters
  - Academic papers are already well-formatted
  - The content is recent (less OCR artifacts)
  - Time is more valuable than perfect cleaning

🎯 HYBRID APPROACH:
  1. Auto-detect academi

In [None]:
# Let's create a FAST academic cleaning mode and test it
import re
import time


def clean_ocr_light_academic(text):
    """
    Lightweight OCR cleanup intended for academic papers.

    Applies a short, fixed sequence of regex substitutions (CID markers,
    control characters, digit-only lines, URLs) followed by whitespace
    normalisation. Falsy input, or input whose stripped length is below
    100 characters, is returned unchanged.
    """
    if not text:
        return text
    if len(text.strip()) < 100:
        return text

    # (pattern, replacement, flags) - applied strictly in this order.
    substitutions = (
        (r"\(cid:\d+\)", "", 0),  # CID artifacts
        (r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", 0),  # control characters
        (r"^\s*\d+\s*$", "", re.MULTILINE),  # lines holding only a number
        (r"https?://[^\s]+", "", 0),  # URLs
        (r"[ \t]+", " ", 0),  # runs of spaces/tabs -> one space
        (r"\n{3,}", "\n\n", 0),  # 3+ newlines -> one blank line
    )

    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()


# Test the light cleaning on Vankley: time the light cleaner on the whole text,
# time the full cleaner on a 5K-character prefix, and extrapolate linearly.
print("🧪 TESTING LIGHT ACADEMIC CLEANING")
print("=" * 40)

try:
    # Load Vankley again
    loader = PDFMinerLoader("./vankley.pdf")
    vankley_text = loader.load()[0].page_content
    if not vankley_text:
        # Every ratio below divides by the text length.
        raise ValueError("no text extracted from ./vankley.pdf")

    print(f"📄 Vankley original: {len(vankley_text):,} characters")

    # Test light cleaning. perf_counter is used because the run lasts only
    # about a millisecond, which a wall clock may round down to zero.
    start_time = time.perf_counter()
    light_cleaned = clean_ocr_light_academic(vankley_text)
    light_time = time.perf_counter() - start_time

    # Test full cleaning (we know this takes ~423 seconds)
    # Let's just estimate based on a small sample
    # NOTE(review): clean_ocr_for_rag is not imported in this cell; it must
    # already be defined in the notebook session - confirm.
    sample_text = vankley_text[:5000]  # 5K char sample
    start_time = time.perf_counter()
    sample_cleaned = clean_ocr_for_rag(sample_text, verbose=False)
    sample_time = time.perf_counter() - start_time
    # Linear extrapolation from the sample to the whole document.
    estimated_full_time = (sample_time * len(vankley_text)) / len(sample_text)

    print(f"\n⚡ LIGHT CLEANING:")
    print(f"  Time: {light_time:.2f} seconds")
    print(f"  Result: {len(light_cleaned):,} chars")
    print(
        f"  Reduction: {((len(vankley_text) - len(light_cleaned)) / len(vankley_text)) * 100:.1f}%"
    )

    print(f"\n🐌 FULL CLEANING (estimated):")
    print(f"  Time: ~{estimated_full_time:.1f} seconds")
    print(f"  Reduction: ~13.2% (from previous run)")

    if light_time > 0:
        speedup = estimated_full_time / light_time
        print(f"\n🚀 LIGHT CLEANING IS {speedup:.0f}X FASTER!")
    else:
        # Avoid dividing by a zero-length measurement.
        print("\n🚀 Light cleaning finished too quickly to measure a speedup")

    # Show sample of light cleaned text
    print(f"\n📝 LIGHT CLEANING SAMPLE:")
    print(f"  {light_cleaned[1000:1200]}...")

except Exception as e:
    print(f"Error: {e}")

print(f"\n🎯 CONCLUSION FOR 23GB LIBRARY:")
print(f"  - Use LIGHT cleaning for academic papers")
print(f"  - Use FULL cleaning for books/non-academic")
print(f"  - Will save ~90% of processing time")
print(f"  - Still removes major OCR artifacts")

In [6]:
# Run the full cleaner, with verbose logging, on the first Document loaded from vankley.pdf.
loader = PDFMinerLoader("./vankley.pdf")
doc = loader.load()[0].page_content
# NOTE(review): clean_ocr_for_rag is not imported in any visible cell - confirm where it is defined.
text = clean_ocr_for_rag(doc, verbose=True)

Processing Academic English document (66,909 chars)
  Applying academic paper cleaning...
  Applying English concatenation repairs...
Cleaned Academic English text: 66,909 → 58,055 chars (-13.2%)
  Applying English concatenation repairs...
Cleaned Academic English text: 66,909 → 58,055 chars (-13.2%)


In [7]:
# OCR CLEANING PERFORMANCE DIAGNOSIS
# Let's identify which component is causing the 422-second slowdown

import re
import time


def time_component(func, text, description):
    """Run ``func(text)`` once, print a timing row, and return the outcome.

    Args:
        func: Callable taking the text as its only argument.
        text: Input passed to ``func``; its length is used for the throughput figure.
        description: Label printed in the first column of the timing row.

    Returns:
        A ``(result, elapsed)`` tuple: the value returned by ``func`` and the
        elapsed time in seconds.
    """
    # perf_counter is the high-resolution clock meant for measuring short
    # intervals; a wall clock can report 0 for millisecond-scale work.
    start = time.perf_counter()
    result = func(text)
    elapsed = time.perf_counter() - start
    # Guard the throughput figure against a zero-length measurement.
    chars_per_sec = len(text) / elapsed if elapsed > 0 else float("inf")
    print(f"  {description:30} | {elapsed:6.3f}s | {chars_per_sec:8,.0f} chars/sec")
    return result, elapsed


def test_basic_ocr_cleaning(text):
    """Test basic OCR artifact removal only"""
    text = re.sub(r"\(cid:\d+\)", "", text)
    text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text)
    text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"https?://[^\s]+", "", text)
    return text


def test_academic_metadata_cleaning(text):
    """Test academic metadata removal patterns"""
    # These are the suspected slow patterns
    text = re.sub(
        r"downloaded from[^.]{0,200}on[^.]{0,50}at[^.]{0,50}utc",
        "",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(
        r"this content downloaded[^.]{0,100}from[^.]{0,100}on[^.]{0,50}\d{4}",
        "",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(
        r"jstor[^.]{0,100}digitize[^.]{0,100}access", "", text, flags=re.IGNORECASE
    )
    text = re.sub(r"source:[^:]{0,200}stable url:", "", text, flags=re.IGNORECASE)
    text = re.sub(r"doi:\s*10\.\d+/[^\s]+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\[\d+[,\s\d]*\]", "", text)  # Citations [1], [1,2,3]
    text = re.sub(r"\(\d{4}[a-z]?\)", "", text)  # Years (2020), (2020a)
    return text


def test_concatenation_repair_sample(text):
    """Test a sample of concatenation repair patterns"""
    # Test just a few key patterns to see if this is the bottleneck
    repairs = [
        (r"\bthe([A-Z][a-z]{2,})\b", r"the \1"),
        (r"\band([A-Z][a-z]{2,})\b", r"and \1"),
        (r"\bfor([A-Z][a-z]{2,})\b", r"for \1"),
        (r"\bde([A-ZÀ-Ÿ][a-zà-ÿ]{2,})\b", r"de \1"),
        (r"\ble([A-ZÀ-Ÿ][a-zà-ÿ]{2,})\b", r"le \1"),
        (r"([a-z]{4,})([A-Z][a-z]{3,})", r"\1 \2"),  # General pattern
    ]

    for pattern, replacement in repairs:
        text = re.sub(pattern, replacement, text)
    return text


def test_all_concatenation_patterns(text):
    """Test ALL concatenation patterns (the suspected bottleneck)"""
    # This simulates the full concatenation repair from your function
    # Let's see if THIS is what's taking 400+ seconds

    # English patterns (50+ patterns)
    english_repairs = [
        (r"\bthe([A-Z][a-z]{2,})\b", r"the \1"),
        (r"\band([A-Z][a-z]{2,})\b", r"and \1"),
        (r"\bfor([A-Z][a-z]{2,})\b", r"for \1"),
        (r"\bwith([A-Z][a-z]{2,})\b", r"with \1"),
        (r"\bfrom([A-Z][a-z]{2,})\b", r"from \1"),
        (r"\bthat([A-Z][a-z]{2,})\b", r"that \1"),
        (r"\bthis([A-Z][a-z]{2,})\b", r"this \1"),
        (r"\binto([A-Z][a-z]{2,})\b", r"into \1"),
        (r"\bover([A-Z][a-z]{2,})\b", r"over \1"),
        (r"\bafter([A-Z][a-z]{2,})\b", r"after \1"),
        (r"\bbefore([A-Z][a-z]{2,})\b", r"before \1"),
        (r"\bduring([A-Z][a-z]{2,})\b", r"during \1"),
        (r"\bthrough([A-Z][a-z]{2,})\b", r"through \1"),
        (r"\bwithout([A-Z][a-z]{2,})\b", r"without \1"),
        (r"\babout([A-Z][a-z]{2,})\b", r"about \1"),
        (r"\bunder([A-Z][a-z]{2,})\b", r"under \1"),
        (r"\babove([A-Z][a-z]{2,})\b", r"above \1"),
        (r"\bbetween([A-Z][a-z]{2,})\b", r"between \1"),
        (r"\bagainst([A-Z][a-z]{2,})\b", r"against \1"),
        (r"\bwithin([A-Z][a-z]{2,})\b", r"within \1"),
        # ... and 30+ more patterns
        (r"([a-z]{4,})([A-Z][a-z]{3,})", r"\1 \2"),  # The big one
    ]

    # Apply all English patterns
    for pattern, replacement in english_repairs:
        text = re.sub(pattern, replacement, text)

    return text


print("🔬 OCR CLEANING PERFORMANCE DIAGNOSIS")
print("=" * 60)

# Load Vankley (the problematic one)
print("\n📄 Loading Vankley (problematic academic paper)...")
loader = PDFMinerLoader("./vankley.pdf")
vankley_text = loader.load()[0].page_content
print(f"   Text length: {len(vankley_text):,} characters")

print(f"\n⏱️  COMPONENT TIMING (on {len(vankley_text):,} chars):")
print("   Component                    |   Time   | Chars/sec")
print("   " + "-" * 54)

# Test each component separately. The stages are chained: each one receives
# the output of the previous stage, as a real pipeline would.
text = vankley_text

# 1. Basic OCR cleaning
text, t1 = time_component(test_basic_ocr_cleaning, text, "Basic OCR artifacts")

# 2. Academic metadata cleaning
text, t2 = time_component(
    test_academic_metadata_cleaning, text, "Academic metadata removal"
)

# 3. Sample concatenation repair
text, t3 = time_component(
    test_concatenation_repair_sample, text, "Sample concatenation (6 patterns)"
)

# 4. ALL concatenation patterns (this might be the culprit!)
# Strings are immutable, so this is just a second name for the same text.
text_copy = text
text_full, t4 = time_component(
    test_all_concatenation_patterns, text_copy, "ALL concatenation patterns"
)

print(f"\n📊 BOTTLENECK ANALYSIS:")
total_light = t1 + t2 + t3
print(f"   Light cleaning (basic + meta + sample): {total_light:.3f} seconds")
print(f"   ALL concatenation patterns:            {t4:.3f} seconds")
print(f"   Previous full cleaning time:           422.7 seconds")

if t4 > 10:
    print(f"\n🎯 FOUND THE BOTTLENECK!")
    print(f"   The concatenation repair patterns are the problem!")
    # The pattern list is local to test_all_concatenation_patterns and cannot
    # be counted from here, so the message no longer tries to report its size.
    print(
        f"   Running the full concatenation pattern list on {len(vankley_text):,} chars"
    )
    print(f"   Each pattern scans the entire text = massive computational cost")
else:
    print(f"\n🤔 UNEXPECTED RESULT:")
    print(f"   Individual components are fast")
    print(f"   The slowdown must be from:")
    print(f"   - Regex interactions/interference")
    print(f"   - Memory pressure from repeated full-text operations")
    print(f"   - Academic content causing worst-case regex behavior")

print(f"\n💡 SOLUTION:")
print(f"   Skip heavy concatenation repair for academic papers")
print(f"   Use basic OCR cleaning only: {t1:.3f}s vs 422.7s")
if t1 > 0:
    print(f"   = {422.7/t1:.0f}x speedup for academic papers!")
else:
    # A millisecond-scale stage can be measured as exactly 0 seconds.
    print("   = speedup not computable (basic cleaning was too fast to time)")

🔬 OCR CLEANING PERFORMANCE DIAGNOSIS

📄 Loading Vankley (problematic academic paper)...
   Text length: 66,909 characters

⏱️  COMPONENT TIMING (on 66,909 chars):
   Component                    |   Time   | Chars/sec
   ------------------------------------------------------
  Basic OCR artifacts            |  0.001s | 54,156,057 chars/sec
  Academic metadata removal      |  0.007s | 9,328,659 chars/sec
  Sample concatenation (6 patterns) |  0.014s | 4,490,888 chars/sec
  ALL concatenation patterns     |  0.037s | 1,708,678 chars/sec

📊 BOTTLENECK ANALYSIS:
   Light cleaning (basic + meta + sample): 0.022 seconds
   ALL concatenation patterns:            0.037 seconds
   Previous full cleaning time:           422.7 seconds

🤔 UNEXPECTED RESULT:
   Individual components are fast
   The slowdown must be from:
   - Regex interactions/interference
   - Memory pressure from repeated full-text operations
   - Academic content causing worst-case regex behavior

💡 SOLUTION:
   Skip heavy conca

In [None]:
# Test the new refactored OCR cleaning system
from test_ocr.clean_ocr_refactored import (
    clean_ocr_basic,
    clean_ocr_advanced,
    clean_ocr_smart,
    benchmark_cleaning_methods,
)
from langchain_community.document_loaders import PDFMinerLoader
import time

import os

print("🚀 Testing Refactored OCR Cleaning System")
print("=" * 50)

# Load a test document
print("\n📄 Loading Vankley (academic paper) for testing...")
pdf_path = "./vankley.pdf"
loader = PDFMinerLoader(pdf_path)
documents = loader.load()
raw_text = "\n\n".join([doc.page_content for doc in documents])
print(f"   📏 Raw text: {len(raw_text):,} characters")

# Take a sample for testing (first 50,000 chars)
test_text = raw_text[:50000]
print(f"   🎯 Test sample: {len(test_text):,} characters")

print("\n" + "=" * 50)
print("SPEED TEST: Basic vs Advanced Cleaning")
print("=" * 50)

# Test basic cleaning (perf_counter: high-resolution clock for short intervals)
print("\n🚀 Testing BASIC cleaning...")
start_time = time.perf_counter()
basic_result = clean_ocr_basic(test_text, verbose=True)
basic_time = time.perf_counter() - start_time
print(f"   ⚡ Basic cleaning: {basic_time:.3f} seconds")

# Test advanced cleaning
print("\n🔧 Testing ADVANCED cleaning...")
start_time = time.perf_counter()
advanced_result = clean_ocr_advanced(test_text, verbose=True)
advanced_time = time.perf_counter() - start_time
print(f"   🔧 Advanced cleaning: {advanced_time:.3f} seconds")

# Calculate speedup
speedup = advanced_time / basic_time if basic_time > 0 else 0
print(f"\n⚡ SPEEDUP: Basic is {speedup:.1f}x faster than Advanced")

# Quality comparison: percentage of characters removed by each cleaner
basic_reduction = ((len(test_text) - len(basic_result)) / len(test_text)) * 100
advanced_reduction = ((len(test_text) - len(advanced_result)) / len(test_text)) * 100

print(f"\n📊 QUALITY COMPARISON:")
print(
    f"   Basic:    {len(test_text):,} → {len(basic_result):,} chars (-{basic_reduction:.1f}%)"
)
print(
    f"   Advanced: {len(test_text):,} → {len(advanced_result):,} chars (-{advanced_reduction:.1f}%)"
)
print(
    f"   Quality difference: {advanced_reduction - basic_reduction:.1f} percentage points"
)

print("\n" + "=" * 50)
print("LIBRARY PROCESSING TIME ESTIMATES")
print("=" * 50)

# Estimate processing times for full 23GB library.
# The text density is taken from the PDF's real size on disk. Reading a
# "file_size" entry from the loader metadata is unreliable: when that key is
# absent, a hard-coded 1 MB per document is silently assumed instead.
pdf_bytes = os.path.getsize(pdf_path)
chars_per_gb = len(raw_text) / (pdf_bytes / 1e9)
total_chars_23gb = 23 * chars_per_gb

basic_time_per_char = basic_time / len(test_text)
advanced_time_per_char = advanced_time / len(test_text)

# Linear extrapolation from the test sample to the whole library.
basic_total_time = total_chars_23gb * basic_time_per_char
advanced_total_time = total_chars_23gb * advanced_time_per_char

print(f"\n📊 For 23GB Zotero library (~{total_chars_23gb/1e9:.1f}B characters):")
print(f"   🚀 Basic cleaning:    {basic_total_time/3600:.1f} hours")
print(f"   🔧 Advanced cleaning: {advanced_total_time/3600:.1f} hours")
print(
    f"   💡 Time saved:       {(advanced_total_time - basic_total_time)/3600:.1f} hours"
)

if basic_total_time < 3600:
    print(f"\n✅ RECOMMENDATION: Use BASIC cleaning for large libraries")
    print(f"   Can process 23GB in under 1 hour!")
else:
    print(f"\n⚠️  Even basic cleaning may take {basic_total_time/3600:.1f} hours")
    print(f"   Consider processing in chunks or using faster hardware")

In [8]:
# Test the NEW MODULAR OCR cleaning system
from test_ocr.clean_ocr_modular import *
from langchain_community.document_loaders import PDFMinerLoader
import time

import os

print("🚀 Testing NEW MODULAR OCR Cleaning System")
print("=" * 60)

# Load Vankley (the problematic academic paper)
print("\n📄 Loading Vankley academic paper...")
pdf_path = "./vankley.pdf"
loader = PDFMinerLoader(pdf_path)
documents = loader.load()
raw_text = "\n\n".join([doc.page_content for doc in documents])
test_text = raw_text[:50000]  # 50K sample
print(f"   📏 Test sample: {len(test_text):,} characters")

print("\n" + "=" * 60)
print("MODULAR PIPELINE PERFORMANCE TEST")
print("=" * 60)

# Test individual components first
print("\n🧪 Testing INDIVIDUAL components on Vankley:")
print("-" * 40)
component_results = test_individual_components(test_text, verbose=True)

print("\n" + "=" * 60)
print("PIPELINE COMPARISON")
print("=" * 60)

# Test all pipeline methods
results = benchmark_pipeline_performance(test_text, iterations=3)

print(f"\n📊 Pipeline Performance (text: {results['text_length']:,} chars):")
print("-" * 50)
for name, data in results["pipelines"].items():
    speedup = data["speedup_vs_auto"]
    time_ms = data["avg_time"] * 1000
    print(
        f"{name.upper():10} {time_ms:6.1f}ms  {speedup:5.1f}x faster  {data['result_length']:,} chars"
    )

print("\n" + "=" * 60)
print("CUSTOM PIPELINE EXAMPLES")
print("=" * 60)

# Demonstrate custom pipelines for different use cases.
# perf_counter is used because these runs last only milliseconds.
print("\n🚀 ULTRA-FAST (artifacts only):")
start = time.perf_counter()
ultra_fast = build_custom_pipeline(test_text, ["artifacts"], verbose=True)
ultra_time = time.perf_counter() - start
print(f"   ⚡ Time: {ultra_time*1000:.1f}ms")

print("\n🎓 ACADEMIC-LITE (no concatenation repair):")
start = time.perf_counter()
academic_lite = build_custom_pipeline(
    test_text, ["artifacts", "academic_metadata", "line_breaks"], verbose=True
)
academic_lite_time = time.perf_counter() - start
print(f"   ⚡ Time: {academic_lite_time*1000:.1f}ms")

print("\n📚 BOOK-LITE (no concatenation repair):")
start = time.perf_counter()
book_lite = build_custom_pipeline(
    test_text, ["artifacts", "book_metadata", "line_breaks"], verbose=True
)
book_lite_time = time.perf_counter() - start
print(f"   ⚡ Time: {book_lite_time*1000:.1f}ms")

print("\n" + "=" * 60)
print("23GB LIBRARY PROCESSING ESTIMATES")
print("=" * 60)

# Estimate processing times for 23GB library.
# The character count is scaled from this PDF's measured characters-per-byte
# ratio. Deriving it through a per-page figure that is multiplied and then
# divided away again would give the same total for every document.
chars_per_byte = len(raw_text) / os.path.getsize(pdf_path)
total_chars_23gb = 23 * 1e9 * chars_per_byte
# Kept for reference; "page" here means one loaded Document.
chars_per_page = len(raw_text) / len(documents)
estimated_pages_23gb = total_chars_23gb / chars_per_page

print(f"\n📊 Estimated 23GB library: {total_chars_23gb/1e9:.1f}B characters")
print("-" * 50)

for method, time_ms in [
    ("ULTRA-FAST", ultra_time * 1000),
    ("ACADEMIC-LITE", academic_lite_time * 1000),
    ("BOOK-LITE", book_lite_time * 1000),
    ("FAST", results["pipelines"]["fast"]["avg_time"] * 1000),
    ("AUTO", results["pipelines"]["auto"]["avg_time"] * 1000),
]:
    # Linear extrapolation from the test sample to the whole library.
    time_per_char = (time_ms / 1000) / len(test_text)
    total_hours = (total_chars_23gb * time_per_char) / 3600

    if total_hours < 1:
        time_str = f"{total_hours*60:.0f} minutes"
    elif total_hours < 24:
        time_str = f"{total_hours:.1f} hours"
    else:
        time_str = f"{total_hours/24:.1f} days"

    print(f"{method:12} {time_str:>12}")

print(f"\n✅ RECOMMENDATION for 23GB library:")
if ultra_time * total_chars_23gb / len(test_text) < 3600:
    print(f"   🚀 Use ULTRA-FAST pipeline (artifacts only)")
    print(f"   📈 Processes 23GB in under 1 hour!")
elif academic_lite_time * total_chars_23gb / len(test_text) < 3600:
    print(f"   🎓 Use ACADEMIC-LITE pipeline (no concatenation repair)")
    print(f"   📈 Good balance of speed and quality")
else:
    print(f"   📊 Even fastest method may take hours - consider chunking")

print(f"\n💡 Key insight: The concatenation repair step is the bottleneck!")
print(f"   For 23GB libraries, skip it and get 10x+ speedup")

🚀 Testing NEW MODULAR OCR Cleaning System

📄 Loading Vankley academic paper...
   📏 Test sample: 50,000 characters

MODULAR PIPELINE PERFORMANCE TEST

🧪 Testing INDIVIDUAL components on Vankley:
----------------------------------------
🧪 Testing individual components:
OCR Artifacts         0.005s   50,000 →  46,716 chars ( -6.6%)
Academic Metadata     0.009s   50,000 →  49,487 chars ( -1.0%)
Book Metadata         0.005s   50,000 →  49,991 chars ( -0.0%)
French Concatenation  0.017s   50,000 →  50,001 chars ( +0.0%)
English Concatenation  0.016s   50,000 →  50,001 chars ( +0.0%)
Line Breaks           0.006s   50,000 →  48,666 chars ( -2.7%)

PIPELINE COMPARISON

📊 Pipeline Performance (text: 50,000 chars):
--------------------------------------------------
FAST         11.4ms    6.8x faster  45,393 chars
ACADEMIC     70.4ms    1.1x faster  44,910 chars
BOOK         65.7ms    1.2x faster  45,384 chars
AUTO         78.1ms    1.0x faster  44,910 chars

CUSTOM PIPELINE EXAMPLES

🚀 ULTRA-FAS