In [74]:
!pip install PyPDF2 pytesseract pdf2image groq
# Note: For Windows, you might need to install poppler and tesseract separately
# Download poppler from: https://github.com/oschwartz10612/poppler-windows/releases/
# Download tesseract from: https://github.com/UB-Mannheim/tesseract/wiki



In [75]:
import os
import re
from PyPDF2 import PdfReader
from groq import Groq
import pytesseract
from pdf2image import convert_from_path

In [None]:
# Set your Groq API key as environment variable first
# Make sure to replace this with your actual API key
os.environ['GROQ_API_KEY'] = "-----"

try:
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    # Test with multiple models to find a working one
    models_to_try = [
        "llama-3.1-8b-instant",
        "mixtral-8x7b-32768", 
        "gemma-7b-it",
        "llama3-8b-8192"
    ]
    
    working_model = None
    for model in models_to_try:
        try:
            test_response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": "Hello"}],
                max_tokens=5
            )
            working_model = model
            print(f"✓ Groq API connection successful with model: {model}")
            break
        except Exception as model_error:
            print(f"Model {model} failed: {str(model_error)[:100]}...")
            continue
    
    if not working_model:
        print("✗ No working models found")
    
except Exception as e:
    print(f"✗ Groq API connection failed: {e}")
    print("Please check your API key and internet connection")

✓ Groq API connection successful with model: llama-3.1-8b-instant


In [77]:
def extract_text_from_pdf_hybrid(pdf_path):
    """
    Extracts text from PDF using hybrid approach:
    - First tries normal text extraction
    - Falls back to OCR for pages with little/no text
    """
    try:
        reader = PdfReader(pdf_path)
        pages_text = []
        
        print("Extracting text using hybrid approach...")
        
        for i, page in enumerate(reader.pages):
            # Try normal text extraction first
            text = page.extract_text().strip()
            
            if len(text) < 50:  # If page has very little text, try OCR
                try:
                    print(f"Page {i+1}: Using OCR (normal extraction found {len(text)} characters)")
                    # Convert page to image and use OCR
                    from pdf2image import convert_from_path
                    import pytesseract
                    
                    # Convert just this page
                    images = convert_from_path(pdf_path, first_page=i+1, last_page=i+1)
                    if images:
                        ocr_text = pytesseract.image_to_string(images[0]).strip()
                        if len(ocr_text) > len(text):  # Use OCR if it found more text
                            text = ocr_text
                            print(f"  OCR found {len(ocr_text)} characters")
                        else:
                            print(f"  OCR found {len(ocr_text)} characters (keeping original)")
                except Exception as ocr_error:
                    print(f"  OCR failed for page {i+1}: {ocr_error}")
            else:
                print(f"Page {i+1}: Normal extraction found {len(text)} characters")
            
            pages_text.append(text if text else "[EMPTY PAGE]")
        
        return pages_text
        
    except Exception as e:
        print(f"Error extracting PDF: {e}")
        return []

# Extract text using hybrid approach
pdf_path = "merged doc.pdf"
pages_text = extract_text_from_pdf_hybrid(pdf_path)
print(f"\nTotal pages processed: {len(pages_text)}")

# Show content summary
print(f"\nContent summary:")
for i, text in enumerate(pages_text):
    word_count = len(text.split()) if text else 0
    print(f"Page {i+1}: {word_count} words")
    if text.strip() and text != "[EMPTY PAGE]":
        preview = text.strip()[:100].replace('\n', ' ')
        print(f"  Preview: {preview}...")
    else:
        print(f"  Status: Empty or no extractable text")

Extracting text using hybrid approach...
Page 1: Using OCR (normal extraction found 0 characters)
Page 1: Using OCR (normal extraction found 0 characters)
  OCR found 3617 characters
  OCR found 3617 characters
Page 2: Using OCR (normal extraction found 0 characters)
Page 2: Using OCR (normal extraction found 0 characters)
  OCR found 421 characters
  OCR found 421 characters
Page 3: Using OCR (normal extraction found 0 characters)
Page 3: Using OCR (normal extraction found 0 characters)


unknown widths : 
[0, IndirectObject(60, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(67, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(73, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(79, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(85, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(91, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(67, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(73, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(79, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(85, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(91, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(97, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(103, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(109, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(115, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(97, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(

  OCR found 1855 characters
Page 4: Normal extraction found 1333 characters
Page 5: Normal extraction found 1131 characters
Page 6: Normal extraction found 1319 characters
Page 7: Using OCR (normal extraction found 0 characters)
  OCR found 5863 characters
Page 8: Using OCR (normal extraction found 0 characters)
  OCR found 5863 characters
Page 8: Using OCR (normal extraction found 0 characters)


unknown widths : 
[0, IndirectObject(245, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(251, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(257, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(263, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(251, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(257, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(263, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(269, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(275, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(281, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(287, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(269, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(275, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(281, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(287, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(79, 0, 2300106536768)]
unknown widths : 
[0, Ind

  OCR found 5629 characters
Page 9: Normal extraction found 1370 characters
Page 10: Normal extraction found 1137 characters


unknown widths : 
[0, IndirectObject(79, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(387, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(387, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(393, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(399, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(393, 0, 2300106536768)]
unknown widths : 
[0, IndirectObject(399, 0, 2300106536768)]


Page 11: Normal extraction found 1341 characters
Page 12: Using OCR (normal extraction found 0 characters)
  OCR found 6055 characters
Page 13: Using OCR (normal extraction found 0 characters)
  OCR found 6055 characters
Page 13: Using OCR (normal extraction found 0 characters)
  OCR found 5830 characters

Total pages processed: 13

Content summary:
Page 1: 586 words
  Preview: LOAN #: 410 BORROWER'S CERTIFICATION & AUTHORIZATION  Certification The undersigned certify the foll...
Page 2: 70 words
  Preview: LOAN #: 41  SIGNATURE ADDENDUM  |We consent to the use of the information proved by us for any purpo...
Page 3: 312 words
  Preview: Social Security Administration Supplemental Security Income Important Information  SUITE 200 2ND FLO...
Page 4: 202 words
  Preview: W-2 Form W-2 Wage & Tax Statement 2023 Scan QR code to go to TurboTax and Import your  W-2 informatl...
Page 5: 184 words
  Preview: Form W-2 Wage & Tax Statement 2023  Copy 2 -To Be Flied With Employee's State, City, or 

In [78]:
def classify_page_zero_shot(text):
    """
    Sends page text to Groq API for zero-shot classification.
    Returns a short label for the document type.
    """
    if not text.strip():  # Handle empty pages
        return "Empty Page"
    
    prompt = f"""
    You are a document classification assistant.
    Read the following page content and classify it into one of these categories:
    - Contract
    - Invoice
    - Report
    - Letter
    - Manual
    - Form
    - Certificate
    - Other

    Page Content:
    {text[:2000]}  # limit input for long pages

    Respond with ONLY the category name (one word).
    """
    
    try:
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",  # Updated to working model
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=10
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error classifying page: {e}")
        return "Classification Error"

In [79]:
# Test classification on first page with working model
if pages_text:
    # Test with a page that has content
    for i, text in enumerate(pages_text):
        if text.strip() and len(text.strip()) > 100:
            print(f"Testing page {i+1} classification...")
            test_classification = classify_page_zero_shot(text)
            print(f"Page {i+1} classification: {test_classification}")
            print(f"Page {i+1} preview: {text[:200]}...")
            break
    else:
        print("No pages with sufficient content found for testing")
else:
    print("No pages found in PDF")

Testing page 1 classification...
Page 1 classification: Certificate
Page 1 preview: LOAN #: 410
BORROWER'S CERTIFICATION & AUTHORIZATION

Certification
The undersigned certify the following:
1. \/We have applied for a mortgage toan from LP

(‘Lender’).
In applying for the loan, Vwe c...
Page 1 classification: Certificate
Page 1 preview: LOAN #: 410
BORROWER'S CERTIFICATION & AUTHORIZATION

Certification
The undersigned certify the following:
1. \/We have applied for a mortgage toan from LP

(‘Lender’).
In applying for the loan, Vwe c...


In [80]:
page_categories = [classify_page_zero_shot(text) for text in pages_text]

# Print results
for i, category in enumerate(page_categories, start=1):
    print(f"Page {i}: {category}")

Page 1: Certificate
Page 2: Certificate
Page 3: Letter
Page 4: Certificate
Page 5: Form
Page 6: Form
Page 7: Letter
Page 8: Manual
Page 9: W-2
Page 10: Form
Page 11: Form.
Page 12: Manual
Page 13: Manual


In [81]:
from collections import defaultdict

category_pages = defaultdict(list)
for i, category in enumerate(page_categories, start=1):
    category_pages[category].append(i)

print("Document type summary:")
for cat, pages in category_pages.items():
    print(f"{cat}: Pages {pages}")


Document type summary:
Certificate: Pages [1, 2, 4]
Letter: Pages [3, 7]
Form: Pages [5, 6, 10]
Manual: Pages [8, 12, 13]
W-2: Pages [9]
Form.: Pages [11]


In [82]:
# Cell 9: Separate PDF based on document classifications
from PyPDF2 import PdfWriter
import os

def separate_pdf_by_classification(pdf_path, page_categories):
    """
    Separates the PDF into different files based on document classifications.
    Groups consecutive pages with the same classification.
    """
    reader = PdfReader(pdf_path)
    
    # Group consecutive pages with same classification
    documents = []
    current_doc = {"category": page_categories[0], "pages": [0]}
    
    for i in range(1, len(page_categories)):
        if page_categories[i] == current_doc["category"]:
            current_doc["pages"].append(i)
        else:
            documents.append(current_doc)
            current_doc = {"category": page_categories[i], "pages": [i]}
    
    documents.append(current_doc)  # Add the last document
    
    # Save each document group
    saved_files = []
    for doc_id, doc in enumerate(documents):
        if doc["category"] in ["Empty Page", "Classification Error"]:
            continue  # Skip empty or error pages
            
        writer = PdfWriter()
        for page_num in doc["pages"]:
            writer.add_page(reader.pages[page_num])
        
        filename = f"separated_{doc_id + 1}_{doc['category']}_pages_{min(doc['pages']) + 1}-{max(doc['pages']) + 1}.pdf"
        
        with open(filename, 'wb') as output_file:
            writer.write(output_file)
        
        saved_files.append(filename)
        print(f"Saved: {filename} ({len(doc['pages'])} pages)")
    
    return saved_files, documents

# Separate the PDF
saved_files, document_groups = separate_pdf_by_classification(pdf_path, page_categories)

print(f"\nSuccessfully separated into {len(saved_files)} documents:")
for filename in saved_files:
    print(f"- {filename}")

print(f"\nDocument groups:")
for i, doc in enumerate(document_groups):
    pages_range = f"{min(doc['pages']) + 1}-{max(doc['pages']) + 1}" if len(doc['pages']) > 1 else str(doc['pages'][0] + 1)
    print(f"Group {i + 1}: {doc['category']} (Pages {pages_range})")

Saved: separated_1_Certificate_pages_1-2.pdf (2 pages)
Saved: separated_2_Letter_pages_3-3.pdf (1 pages)
Saved: separated_3_Certificate_pages_4-4.pdf (1 pages)
Saved: separated_4_Form_pages_5-6.pdf (2 pages)
Saved: separated_5_Letter_pages_7-7.pdf (1 pages)
Saved: separated_6_Manual_pages_8-8.pdf (1 pages)
Saved: separated_7_W-2_pages_9-9.pdf (1 pages)
Saved: separated_8_Form_pages_10-10.pdf (1 pages)
Saved: separated_9_Form._pages_11-11.pdf (1 pages)
Saved: separated_10_Manual_pages_12-13.pdf (2 pages)

Successfully separated into 10 documents:
- separated_1_Certificate_pages_1-2.pdf
- separated_2_Letter_pages_3-3.pdf
- separated_3_Certificate_pages_4-4.pdf
- separated_4_Form_pages_5-6.pdf
- separated_5_Letter_pages_7-7.pdf
- separated_6_Manual_pages_8-8.pdf
- separated_7_W-2_pages_9-9.pdf
- separated_8_Form_pages_10-10.pdf
- separated_9_Form._pages_11-11.pdf
- separated_10_Manual_pages_12-13.pdf

Document groups:
Group 1: Certificate (Pages 1-2)
Group 2: Letter (Pages 3)
Group 3: Cer

In [83]:
# Cell 10: Enhanced PDF Separation with Smart Grouping
def separate_pdf_with_smart_grouping(pdf_path, page_categories):
    """
    Enhanced separation that groups similar document types and allows manual merging
    """
    reader = PdfReader(pdf_path)
    
    # Define document type similarities for grouping
    similar_types = {
        'Form': ['Form', 'Tax', 'W-2', 'W-2.', 'Invoice', 'Certificate'],
        'Document': ['Letter', 'Report', 'Manual', 'Contract'],
        'Other': ['Other', 'Empty Page']
    }
    
    # Normalize categories to main groups
    def normalize_category(category):
        for main_type, similar in similar_types.items():
            if category in similar:
                return main_type
        return category
    
    # Group consecutive pages by normalized categories
    documents = []
    if page_categories:
        normalized_categories = [normalize_category(cat) for cat in page_categories]
        
        current_doc = {
            "main_category": normalized_categories[0], 
            "original_categories": [page_categories[0]],
            "pages": [0]
        }
        
        for i in range(1, len(normalized_categories)):
            if normalized_categories[i] == current_doc["main_category"]:
                current_doc["pages"].append(i)
                if page_categories[i] not in current_doc["original_categories"]:
                    current_doc["original_categories"].append(page_categories[i])
            else:
                documents.append(current_doc)
                current_doc = {
                    "main_category": normalized_categories[i],
                    "original_categories": [page_categories[i]],
                    "pages": [i]
                }
        
        documents.append(current_doc)  # Add the last document
    
    # Manual merging option - merge specific documents
    # You can modify this list to merge documents as needed
    merge_groups = [
        [2, 3, 4, 5, 6, 7]  # Merge documents 3-8 (0-indexed: 2-7)
    ]
    
    # Apply manual merging
    for merge_group in merge_groups:
        if len(merge_group) > 1 and all(0 <= idx < len(documents) for idx in merge_group):
            # Merge documents in the group
            main_doc = documents[merge_group[0]]
            for idx in sorted(merge_group[1:], reverse=True):
                if idx < len(documents):
                    merge_doc = documents[idx]
                    main_doc["pages"].extend(merge_doc["pages"])
                    main_doc["original_categories"].extend(merge_doc["original_categories"])
                    documents.pop(idx)
            
            # Sort pages and remove duplicates
            main_doc["pages"] = sorted(list(set(main_doc["pages"])))
            main_doc["original_categories"] = list(set(main_doc["original_categories"]))
            main_doc["main_category"] = "Mixed_Forms"  # Custom name for merged document
    
    # Save each document group
    saved_files = []
    for doc_id, doc in enumerate(documents):
        if doc["main_category"] in ["Other", "Empty Page"]:
            continue  # Skip empty or other pages
            
        writer = PdfWriter()
        for page_num in doc["pages"]:
            writer.add_page(reader.pages[page_num])
        
        # Create descriptive filename
        category_name = doc["main_category"]
        if len(doc["original_categories"]) > 1:
            category_name = f"{category_name}_Mixed"
        
        filename = f"smart_separated_{doc_id + 1}_{category_name}_pages_{min(doc['pages']) + 1}-{max(doc['pages']) + 1}.pdf"
        
        with open(filename, 'wb') as output_file:
            writer.write(output_file)
        
        saved_files.append(filename)
        original_types = ", ".join(doc["original_categories"])
        print(f"Saved: {filename} ({len(doc['pages'])} pages)")
        print(f"  Contains: {original_types}")
    
    return saved_files, documents

# Apply smart grouping
print("Applying smart grouping to merge similar document types...")
smart_files, smart_groups = separate_pdf_with_smart_grouping(pdf_path, page_categories)

print(f"\nSmart separation results - {len(smart_files)} documents:")
for filename in smart_files:
    print(f"- {filename}")

print(f"\nSmart document groups:")
for i, doc in enumerate(smart_groups):
    if doc["main_category"] not in ["Other", "Empty Page"]:
        pages_range = f"{min(doc['pages']) + 1}-{max(doc['pages']) + 1}" if len(doc['pages']) > 1 else str(doc['pages'][0] + 1)
        original_types = ", ".join(doc["original_categories"])
        print(f"Group {i + 1}: {doc['main_category']} (Pages {pages_range})")
        print(f"  Original types: {original_types}")

Applying smart grouping to merge similar document types...
Saved: smart_separated_1_Form_pages_1-2.pdf (2 pages)
  Contains: Certificate
Saved: smart_separated_2_Document_pages_3-3.pdf (1 pages)
  Contains: Letter
Saved: smart_separated_3_Form_Mixed_pages_4-6.pdf (3 pages)
  Contains: Certificate, Form
Saved: smart_separated_4_Document_Mixed_pages_7-8.pdf (2 pages)
  Contains: Letter, Manual
Saved: smart_separated_3_Form_Mixed_pages_4-6.pdf (3 pages)
  Contains: Certificate, Form
Saved: smart_separated_4_Document_Mixed_pages_7-8.pdf (2 pages)
  Contains: Letter, Manual
Saved: smart_separated_5_Form_Mixed_pages_9-10.pdf (2 pages)
  Contains: W-2, Form
Saved: smart_separated_6_Form._pages_11-11.pdf (1 pages)
  Contains: Form.
Saved: smart_separated_7_Document_pages_12-13.pdf (2 pages)
  Contains: Manual

Smart separation results - 7 documents:
- smart_separated_1_Form_pages_1-2.pdf
- smart_separated_2_Document_pages_3-3.pdf
- smart_separated_3_Form_Mixed_pages_4-6.pdf
- smart_separated_4

In [84]:
# Cell 11: Manual Document Grouping (Customize as needed)
def separate_pdf_manual_groups(pdf_path, page_categories, manual_groups):
    """
    Separate PDF based on manually defined page groups
    manual_groups: List of dictionaries with 'name' and 'pages' keys
    Example: [{'name': 'Document1', 'pages': [1, 2]}, {'name': 'Document2', 'pages': [3, 4, 5, 6, 7, 8, 9]}]
    """
    reader = PdfReader(pdf_path)
    saved_files = []
    
    for group_id, group in enumerate(manual_groups):
        group_name = group.get('name', f'Document_{group_id + 1}')
        pages = group.get('pages', [])
        
        if not pages:
            continue
            
        # Convert to 0-indexed pages
        zero_indexed_pages = [p - 1 for p in pages if 1 <= p <= len(reader.pages)]
        
        if not zero_indexed_pages:
            continue
            
        writer = PdfWriter()
        for page_num in zero_indexed_pages:
            writer.add_page(reader.pages[page_num])
        
        filename = f"manual_{group_name}_pages_{min(pages)}-{max(pages)}.pdf"
        
        with open(filename, 'wb') as output_file:
            writer.write(output_file)
        
        saved_files.append(filename)
        
        # Show what document types are included
        included_types = [page_categories[p-1] for p in pages if 1 <= p <= len(page_categories)]
        unique_types = list(set(included_types))
        
        print(f"Saved: {filename} ({len(pages)} pages)")
        print(f"  Contains types: {', '.join(unique_types)}")
    
    return saved_files

# Define your custom grouping here
# Modify these groups according to your needs
custom_groups = [
    {'name': 'Certificates', 'pages': [1, 2]},  # Pages 1-2: Certificate documents
    {'name': 'Combined_Forms_and_Documents', 'pages': [3, 4, 5, 6, 7, 8, 9]},  # Pages 3-9: All forms, letters, reports, etc.
    {'name': 'Tax_Documents', 'pages': [10, 11, 12]},  # Pages 10-12: Tax forms and W-2
    {'name': 'Manuals', 'pages': [13]}  # Page 13: Manual
]

print("Creating custom document groups based on your specifications...")
print("Current grouping:")
for group in custom_groups:
    pages_str = ', '.join(map(str, group['pages']))
    print(f"- {group['name']}: Pages {pages_str}")

print(f"\nApplying custom grouping...")
custom_files = separate_pdf_manual_groups(pdf_path, page_categories, custom_groups)

print(f"\nCustom separation complete - {len(custom_files)} documents created:")
for filename in custom_files:
    print(f"- {filename}")

print("\nTo modify the grouping, edit the 'custom_groups' list above and re-run this cell.")

Creating custom document groups based on your specifications...
Current grouping:
- Certificates: Pages 1, 2
- Combined_Forms_and_Documents: Pages 3, 4, 5, 6, 7, 8, 9
- Tax_Documents: Pages 10, 11, 12
- Manuals: Pages 13

Applying custom grouping...
Saved: manual_Certificates_pages_1-2.pdf (2 pages)
  Contains types: Certificate
Saved: manual_Certificates_pages_1-2.pdf (2 pages)
  Contains types: Certificate
Saved: manual_Combined_Forms_and_Documents_pages_3-9.pdf (7 pages)
  Contains types: Form, Manual, Letter, Certificate, W-2
Saved: manual_Tax_Documents_pages_10-12.pdf (3 pages)
  Contains types: Form, Manual, Form.
Saved: manual_Manuals_pages_13-13.pdf (1 pages)
  Contains types: Manual

Custom separation complete - 4 documents created:
- manual_Certificates_pages_1-2.pdf
- manual_Combined_Forms_and_Documents_pages_3-9.pdf
- manual_Tax_Documents_pages_10-12.pdf
- manual_Manuals_pages_13-13.pdf

To modify the grouping, edit the 'custom_groups' list above and re-run this cell.
Saved

In [85]:
# Cell 12: AI-Powered Smart Document Grouping
def ai_smart_document_grouping(pdf_path, page_categories, pages_text):
    """
    Uses AI to intelligently group related document types together
    """
    reader = PdfReader(pdf_path)
    
    # First, let the AI analyze all page types and suggest groupings
    all_types = list(set(page_categories))
    
    grouping_prompt = f"""
    You are a document organization assistant. I have a PDF with the following document types on different pages:
    {', '.join(all_types)}
    
    These types appear in this order across pages: {', '.join(page_categories)}
    
    Please group related document types that should logically belong together in the same file. 
    For example:
    - Forms, Tax documents, W-2s, Certificates should be grouped as "Official_Documents"
    - Letters, Reports, Manuals might be grouped as "Communications" 
    - Or suggest better logical groupings
    
    Respond with a JSON-like structure showing the groups:
    Group1: [list of document types]
    Group2: [list of document types]
    
    Keep groups logical and don't create too many separate groups.
    """
    
    try:
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[{"role": "user", "content": grouping_prompt}],
            temperature=0.3,
            max_tokens=200
        )
        
        ai_suggestion = response.choices[0].message.content.strip()
        print("AI Grouping Suggestion:")
        print(ai_suggestion)
        
    except Exception as e:
        print(f"AI grouping failed: {e}")
        # Fallback to rule-based grouping
        ai_suggestion = """
        Group1 (Official_Documents): Certificate, Form, Tax, W-2, W-2., Invoice
        Group2 (Communications): Letter, Report, Manual, Contract
        """
    
    # Parse AI suggestion and create document type mapping
    type_to_group = {}
    
    # Simple parsing - you can make this more sophisticated
    if "Group1" in ai_suggestion and "Group2" in ai_suggestion:
        # Extract group information (simplified parsing)
        if "Certificate" in ai_suggestion and "Form" in ai_suggestion:
            official_docs = ["Certificate", "Form", "Tax", "W-2", "W-2.", "Invoice"]
            communications = ["Letter", "Report", "Manual", "Contract"]
            
            for doc_type in official_docs:
                type_to_group[doc_type] = "Official_Documents"
            for doc_type in communications:
                type_to_group[doc_type] = "Communications"
    
    # Fallback mapping if parsing fails
    if not type_to_group:
        type_to_group = {
            "Certificate": "Official_Documents",
            "Form": "Official_Documents", 
            "Tax": "Official_Documents",
            "W-2": "Official_Documents",
            "W-2.": "Official_Documents",
            "Invoice": "Official_Documents",
            "Letter": "Communications",
            "Report": "Communications", 
            "Manual": "Communications",
            "Contract": "Communications",
            "Other": "Miscellaneous"
        }
    
    # Group consecutive pages by AI-determined groups
    documents = []
    if page_categories:
        # Map each page to its group
        page_groups = [type_to_group.get(cat, "Miscellaneous") for cat in page_categories]
        
        current_doc = {
            "group": page_groups[0],
            "original_types": [page_categories[0]],
            "pages": [0]
        }
        
        for i in range(1, len(page_groups)):
            if page_groups[i] == current_doc["group"]:
                # Same group - add to current document
                current_doc["pages"].append(i)
                if page_categories[i] not in current_doc["original_types"]:
                    current_doc["original_types"].append(page_categories[i])
            else:
                # Different group - start new document
                documents.append(current_doc)
                current_doc = {
                    "group": page_groups[i],
                    "original_types": [page_categories[i]],
                    "pages": [i]
                }
        
        documents.append(current_doc)
    
    # Save grouped documents
    saved_files = []
    for doc_id, doc in enumerate(documents):
        if doc["group"] == "Miscellaneous" and len(doc["pages"]) == 1:
            continue  # Skip single miscellaneous pages
        
        writer = PdfWriter()
        for page_num in doc["pages"]:
            writer.add_page(reader.pages[page_num])
        
        # Create descriptive filename
        types_summary = "_".join(sorted(set(doc["original_types"])))
        filename = f"ai_grouped_{doc_id + 1}_{doc['group']}_{types_summary}_pages_{min(doc['pages']) + 1}-{max(doc['pages']) + 1}.pdf"
        
        with open(filename, 'wb') as output_file:
            writer.write(output_file)
        
        saved_files.append(filename)
        print(f"Saved: {filename} ({len(doc['pages'])} pages)")
        print(f"  Group: {doc['group']}")
        print(f"  Contains: {', '.join(doc['original_types'])}")
    
    return saved_files, documents

# Apply AI-powered smart grouping
print("Using AI to intelligently group related documents...")
ai_files, ai_groups = ai_smart_document_grouping(pdf_path, page_categories, pages_text)

print(f"\nAI-powered grouping results - {len(ai_files)} documents:")
for filename in ai_files:
    print(f"- {filename}")

print(f"\nFinal document groups:")
for i, doc in enumerate(ai_groups):
    if not (doc["group"] == "Miscellaneous" and len(doc["pages"]) == 1):
        pages_range = f"{min(doc['pages']) + 1}-{max(doc['pages']) + 1}" if len(doc['pages']) > 1 else str(doc['pages'][0] + 1)
        print(f"Group {i + 1}: {doc['group']} (Pages {pages_range})")
        print(f"  Document types: {', '.join(doc['original_types'])}")

Using AI to intelligently group related documents...

AI Grouping Suggestion:
Based on the document types provided, I suggest the following logical groupings:

```json
{
  "Official_Documents": [
    "Certificate",
    "W-2",
    "Form",
    "Form.",
    "Form"
  ],
  "Communications": [
    "Letter",
    "Letter",
    "Manual",
    "Manual",
    "Manual"
  ]
}
```

However, considering the presence of multiple "Form" types and the fact that "Form." is a variation of "Form", I would suggest merging them into a single group. Also, considering the presence of multiple "Manual" types, I would suggest merging them into a single group.

Here's the updated grouping:

```json
{
  "Official_Documents": [
    "Certificate",
    "W-2",
    "Form"
  ],
  "Communications": [
    "Letter",
    "Manual"
  ]
}
```

This grouping makes sense because "Certificate"
Saved: ai_grouped_1_Official_Documents_Certificate_pages_1-2.pdf (2 pages)
  Group: Official_Documents
  Contains: Certificate
Saved: ai_gro

In [86]:
# Cell 13: Ultra-Smart Content-Aware Document Grouping
def ultra_smart_document_grouping(pdf_path, page_categories, pages_text):
    """
    Uses AI + content analysis to make the smartest possible document grouping decisions
    """
    reader = PdfReader(pdf_path)
    
    print("Analyzing document content for intelligent grouping...")
    
    # Step 1: Analyze content patterns for each page type
    type_analysis = {}
    for i, (page_type, text) in enumerate(zip(page_categories, pages_text)):
        if page_type not in type_analysis:
            type_analysis[page_type] = []
        
        # Extract key characteristics
        word_count = len(text.split()) if text else 0
        has_numbers = bool(re.search(r'\d+', text)) if text else False
        has_dates = bool(re.search(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', text)) if text else False
        has_currency = bool(re.search(r'\$[\d,]+', text)) if text else False
        
        type_analysis[page_type].append({
            'page': i + 1,
            'word_count': word_count,
            'has_numbers': has_numbers,
            'has_dates': has_dates, 
            'has_currency': has_currency,
            'preview': text[:200] if text else ""
        })
    
    # Step 2: Let AI analyze the content and suggest smart groupings
    analysis_prompt = f"""
    Analyze these document types and their content to suggest the smartest grouping:
    
    Document Analysis:
    """
    
    for doc_type, pages_info in type_analysis.items():
        analysis_prompt += f"\\n{doc_type} ({len(pages_info)} pages):"
        for page_info in pages_info[:2]:  # Show first 2 examples
            analysis_prompt += f"\\n  Page {page_info['page']}: {page_info['word_count']} words"
            if page_info['preview']:
                analysis_prompt += f" - Preview: {page_info['preview'][:100]}..."
    
    analysis_prompt += """
    
    Based on the content analysis, group these documents logically. Consider:
    1. Documents that are part of the same process (like loan applications)
    2. Similar document purposes (all tax-related, all forms, etc.)
    3. Content that suggests they belong together
    
    Respond with clear grouping like:
    GROUP_NAME_1: [document_type1, document_type2]
    GROUP_NAME_2: [document_type3]
    
    Use descriptive group names that reflect the purpose.
    """
    
    try:
        response = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[{"role": "user", "content": analysis_prompt}],
            temperature=0.1,
            max_tokens=300
        )
        
        ai_analysis = response.choices[0].message.content.strip()
        print("AI Content-Based Grouping Analysis:")
        print(ai_analysis)
        
        # Parse the AI response to create groupings
        groups = {}
        lines = ai_analysis.split('\\n')
        for line in lines:
            if ':' in line and any(doc_type in line for doc_type in type_analysis.keys()):
                parts = line.split(':', 1)
                if len(parts) == 2:
                    group_name = parts[0].strip()
                    types_str = parts[1].strip()
                    # Extract document types mentioned in this line
                    mentioned_types = []
                    for doc_type in type_analysis.keys():
                        if doc_type in types_str:
                            mentioned_types.append(doc_type)
                    if mentioned_types:
                        groups[group_name] = mentioned_types
        
    except Exception as e:
        print(f"AI analysis failed: {e}")
        groups = {}
    
    # Fallback intelligent grouping if AI parsing fails
    if not groups:
        groups = {
            "Loan_Application_Documents": ["Certificate"],
            "Tax_and_Financial_Forms": ["Form", "Tax", "W-2", "W-2."],
            "Communications_and_Reports": ["Letter", "Report", "Manual"]
        }
        print("Using fallback intelligent grouping")
    
    # Create reverse mapping: document_type -> group_name
    type_to_group = {}
    for group_name, doc_types in groups.items():
        for doc_type in doc_types:
            type_to_group[doc_type] = group_name
    
    # Group pages based on intelligent analysis
    documents = []
    if page_categories:
        current_doc = {
            "group": type_to_group.get(page_categories[0], "Miscellaneous"),
            "types": [page_categories[0]],
            "pages": [0]
        }
        
        for i in range(1, len(page_categories)):
            page_group = type_to_group.get(page_categories[i], "Miscellaneous")
            
            if page_group == current_doc["group"]:
                current_doc["pages"].append(i)
                if page_categories[i] not in current_doc["types"]:
                    current_doc["types"].append(page_categories[i])
            else:
                documents.append(current_doc)
                current_doc = {
                    "group": page_group,
                    "types": [page_categories[i]],
                    "pages": [i]
                }
        
        documents.append(current_doc)
    
    # Save intelligently grouped documents
    saved_files = []
    for doc_id, doc in enumerate(documents):
        if doc["group"] == "Miscellaneous":
            continue
            
        writer = PdfWriter()
        for page_num in doc["pages"]:
            writer.add_page(reader.pages[page_num])
        
        # Create meaningful filename
        filename = f"smart_{doc_id + 1}_{doc['group']}_pages_{min(doc['pages']) + 1}-{max(doc['pages']) + 1}.pdf"
        
        with open(filename, 'wb') as output_file:
            writer.write(output_file)
        
        saved_files.append(filename)
        print(f"\\nSaved: {filename}")
        print(f"  Group: {doc['group']} ({len(doc['pages'])} pages)")
        print(f"  Contains: {', '.join(doc['types'])}")
        
        # Show content summary
        total_words = sum(len(pages_text[p].split()) for p in doc["pages"] if pages_text[p])
        print(f"  Total content: ~{total_words} words")
    
    return saved_files, documents

# Apply ultra-smart content-aware grouping
print("=" * 60)
print("ULTRA-SMART CONTENT-AWARE DOCUMENT GROUPING")
print("=" * 60)

ultra_files, ultra_groups = ultra_smart_document_grouping(pdf_path, page_categories, pages_text)

print(f"\\n🎉 FINAL RESULTS - {len(ultra_files)} intelligently grouped documents:")
print("-" * 50)
for filename in ultra_files:
    print(f"✓ {filename}")

print("\\nThis is the smartest automatic grouping based on content analysis!")

ULTRA-SMART CONTENT-AWARE DOCUMENT GROUPING
Analyzing document content for intelligent grouping...
AI Content-Based Grouping Analysis:
Based on the content analysis, the following groupings are suggested:

**FINANCIAL_DOCUMENTS**: [Certificate, Letter, Form, W-2, Form]
- Certificate: Loan-related certification and authorization
- Letter: Supplemental Security Income information
- Form: Form W-2 Wage & Tax Statement 2023 (multiple copies)
- W-2: W-2 information for TurboTax import
- Form: Form W-2 Wage & Tax Statement 2022 (for employee's records)

**MANUAL_GUIDES**: [Manual]
- Manual: Explanation of codes in box 12 and final pay statement information

**LOAN_APPLICATION**: [Certificate, Letter]
- Certificate: Loan-related certification and authorization
- Letter: Supplemental Security Income information (possibly related to loan application)

**TAX_INFORMATION**: [Form, W-2, Form]
- Form: Form W-2 Wage & Tax Statement 2023 (multiple copies)
- W-2: W-2 information for TurboTax import
- 