# Fix Paragraph Extraction in extract_document.py

This notebook shows how to fix the paragraph-number extraction issue in which too many paragraph numbers are assigned to each chunk. The current implementation incorrectly uses the element index as the paragraph number.

## Problem
- Current code assigns `paragraph_number = i + 1` based on element index
- This creates incorrect paragraph numbering that doesn't reflect actual document structure
- Need to implement proper paragraph tracking by page and globally

## Solution
Replace the current paragraph tracking with a proper schema that:
1. Tracks paragraphs per page separately
2. Maintains a global paragraph counter
3. Uses page changes and element type changes to detect new paragraphs
4. Stores both page-specific and global paragraph numbers

In [None]:
# Snapshot of the flawed paragraph-numbering logic from extract_document.py,
# kept as text so it can be displayed rather than executed.
current_problematic_code = """
# Current problematic implementation:
metadata = {
    'element_index': i,
    'element_type': type(element).__name__,
    'page_number': 1,  # Default value
    'paragraph_number': None,  # This gets set incorrectly later
    # ... other metadata
}

# Later in the code:
if metadata['paragraph_number'] is None:
    metadata['paragraph_number'] = i + 1  # PROBLEM: Uses element index!
"""

# Heading first, then the snippet, each on its own line.
print("Current problematic approach:", current_problematic_code, sep="\n")

In [None]:
# Condensed sketch of the corrected paragraph tracking, stored as text for
# display only. Parts of the surrounding function are elided with
# "# ... existing ..." placeholders inside the snippet.
improved_implementation = """
def extract_text_from_pdf(pdf_path: str, strategy: str = "fast", **kwargs):
    # ... existing code ...
    
    # Initialize paragraph tracking
    paragraphs_by_page = {}  # Dict to track paragraph numbers by page
    last_element_type = None
    last_page_number = None
    global_paragraph_count = 0  # Track paragraphs across the entire document
    
    processed_elements = []
    
    for i, element in enumerate(elements):
        # ... existing element processing ...
        
        # Get page number from element metadata
        page_number = 1  # default
        if hasattr(element, 'metadata') and element.metadata:
            if hasattr(element.metadata, 'page_number') and element.metadata.page_number:
                page_number = element.metadata.page_number
        
        # Initialize paragraph counter for this page if needed
        if page_number not in paragraphs_by_page:
            paragraphs_by_page[page_number] = 0
            
        # Determine if this is a new paragraph
        element_type = type(element).__name__
        is_new_paragraph = False
        
        if page_number != last_page_number:  # New page = new paragraph
            is_new_paragraph = True
        elif element_type != last_element_type:  # Type change = new paragraph  
            is_new_paragraph = True
        elif (hasattr(element, 'category') and 
              element.category == "NarrativeText" and 
              last_element_type in ["Title", "ListItem"]):
            is_new_paragraph = True
            
        if is_new_paragraph:
            paragraphs_by_page[page_number] += 1
            global_paragraph_count += 1
            
        # Create metadata with proper paragraph tracking
        metadata = {
            'element_index': i,
            'element_type': element_type,
            'page_number': page_number,
            'paragraph_number': paragraphs_by_page[page_number],  # Page-specific paragraph
            'global_paragraph_number': global_paragraph_count,    # Global paragraph
            'paragraph_id': f"p{page_number}_para{paragraphs_by_page[page_number]}",
            'extraction_strategy': strategy,
            'filename': filename,
            'country': country,
            'document_title': f"{country} NDC"
        }
        
        # Update tracking variables
        last_element_type = element_type
        last_page_number = page_number
        
        processed_elements.append({
            'text': text,
            'metadata': metadata
        })
    
    return processed_elements
"""

# Heading first, then the snippet, each on its own line.
print("Improved paragraph tracking implementation:", improved_implementation, sep="\n")

## Key Changes Needed

1. **Remove element index-based paragraph numbering**: Stop using `i + 1` as paragraph number
2. **Add paragraph tracking by page**: Track paragraph numbers separately for each page
3. **Add global paragraph tracking**: Maintain a counter across the entire document
4. **Use page and element type changes to detect paragraphs**: Start a new paragraph whenever the page number or the element type changes
5. **Store both page and global paragraph numbers**: Provide both for different use cases

## Implementation Steps

The following code shows the exact changes needed in the `extract_text_from_pdf` function:

In [None]:
# Let's create the complete fixed function
def create_fixed_extract_function():
    """Return the source text of the corrected ``extract_text_from_pdf``.

    The function body is returned as a string and is not executed here.
    The generated code refers to names that this notebook does not define
    (``logger``, ``os``, ``re``, ``traceback``, ``partition_pdf``,
    ``List``/``Dict``/``Any``, ``_extract_country_from_filename``,
    ``_has_character_corruption`` and ``_retry_with_ocr``), so it must be
    placed in a module that provides them.

    Returns:
        str: Python source for ``extract_text_from_pdf`` with per-page and
        global paragraph counters.
    """
    # NOTE: the last-resort handler in the generated code catches
    # ``Exception`` (not a bare ``except:``) and logs the failure, so
    # KeyboardInterrupt/SystemExit are not swallowed silently.
    return '''
def extract_text_from_pdf(
    pdf_path: str, 
    strategy: str = "fast", 
    extract_images: bool = False,
    infer_table_structure: bool = True,
    languages: str = "eng",
    **kwargs
) -> List[Dict[str, Any]]:
    """
    Extract text from a PDF file using the unstructured library with proper paragraph tracking.
    """
    # Validate and map strategy parameter
    valid_strategies = ["fast", "ocr_only", "auto"]
                
    if strategy not in valid_strategies:
        logger.warning(f"Invalid strategy '{strategy}', using 'fast' instead")
        strategy = "fast"

    try:
        filename = os.path.basename(pdf_path)
        country = _extract_country_from_filename(pdf_path)
        logger.info(f"Extracting text from PDF: {pdf_path} using strategy: {strategy}")
        logger.info(f"Detected country: {country}")
        
        languages_list = [lang.strip() for lang in languages.split(',')] if languages else ['eng']
        logger.debug(f"Using languages: {languages_list}")
        
        # Extract elements from the PDF
        elements = partition_pdf(
            filename=pdf_path, 
            strategy=strategy,
            extract_images_in_pdf=extract_images,
            infer_table_structure=infer_table_structure,
            languages=languages_list,
            **kwargs
        )
        
        logger.info(f"Extracted {len(elements)} elements from PDF")
        
        # Initialize paragraph tracking
        paragraphs_by_page = {}  # Dict to track paragraph numbers by page
        last_element_type = None
        last_page_number = None
        global_paragraph_count = 0  # Track paragraphs across the entire document
        
        # Process and filter elements
        processed_elements = []
        has_corruption = False
        
        for i, element in enumerate(elements):
            # Get text content
            text = str(element).strip() if element else ""
            
            # Check for PDF corruption/encoding issues
            if _has_character_corruption(text):
                has_corruption = True
                logger.debug(f"Detected character corruption in element {i}: {text[:100]}...")
                continue  # Skip corrupted elements
            
            # Skip very short or empty text
            if len(text) < 5:
                continue
            
            # Skip common PDF artifacts
            skip_patterns = [
                r'^\\d+$',  # Just page numbers
                r'^[ivxlcdm]+$',  # Roman numerals only
                r'^[\\s\\-_=\\.]+$',  # Just punctuation/whitespace
                r'^(page|pg)\\s*\\d+$',  # Page indicators
            ]
            
            if any(re.match(pattern, text.lower()) for pattern in skip_patterns):
                continue

            # Get page number from element metadata
            page_number = 1  # default
            try:
                if hasattr(element, 'metadata') and element.metadata:
                    if hasattr(element.metadata, 'page_number') and element.metadata.page_number is not None:
                        page_number = element.metadata.page_number
                    elif hasattr(element.metadata, '__dict__'):
                        meta_dict = element.metadata.__dict__
                        if 'page_number' in meta_dict and meta_dict['page_number'] is not None:
                            page_number = meta_dict['page_number']
            except Exception as meta_error:
                logger.warning(f"Error extracting page number from element {i}: {meta_error}")
            
            # Initialize paragraph counter for this page if needed
            if page_number not in paragraphs_by_page:
                paragraphs_by_page[page_number] = 0
                
            # Determine if this is a new paragraph
            element_type = type(element).__name__
            is_new_paragraph = False
            
            if page_number != last_page_number:  # New page = new paragraph
                is_new_paragraph = True
            elif element_type != last_element_type:  # Type change = new paragraph  
                is_new_paragraph = True
            elif (hasattr(element, 'category') and 
                  element.category == "NarrativeText" and 
                  last_element_type in ["Title", "ListItem"]):
                is_new_paragraph = True
                
            if is_new_paragraph:
                paragraphs_by_page[page_number] += 1
                global_paragraph_count += 1
            
            # Create metadata with proper paragraph tracking
            metadata = {
                'element_index': i,
                'element_type': element_type,
                'page_number': page_number,
                'paragraph_number': paragraphs_by_page[page_number],  # Page-specific paragraph
                'global_paragraph_number': global_paragraph_count,    # Global paragraph
                'paragraph_id': f"p{page_number}_para{paragraphs_by_page[page_number]}",
                'extraction_strategy': strategy,
                'filename': filename,
                'country': country,
                'document_title': f"{country} NDC"
            }
            
            # Extract additional metadata if available
            try:
                if hasattr(element, 'metadata') and element.metadata:
                    element_metadata = element.metadata
                    
                    # Extract coordinates if available
                    if hasattr(element_metadata, 'coordinates'):
                        metadata['coordinates'] = str(element_metadata.coordinates)
                    elif hasattr(element_metadata, '__dict__'):
                        meta_dict = element_metadata.__dict__
                        if 'coordinates' in meta_dict:
                            metadata['coordinates'] = str(meta_dict['coordinates'])
                    
                    # Extract filename if available
                    if hasattr(element_metadata, 'filename'):
                        metadata['source_file'] = element_metadata.filename
                    elif hasattr(element_metadata, '__dict__'):
                        meta_dict = element_metadata.__dict__
                        if 'filename' in meta_dict:
                            metadata['source_file'] = meta_dict['filename']
                                    
            except Exception as meta_error:
                logger.warning(f"Error extracting additional metadata from element {i}: {meta_error}")
            
            # Update tracking variables
            last_element_type = element_type
            last_page_number = page_number
            
            processed_elements.append({
                'text': text,
                'metadata': metadata
            })

        # Handle corruption and fallback logic (existing code)
        if has_corruption and len(processed_elements) < 5:
            logger.warning(f"Detected significant character corruption, attempting OCR extraction")
            return _retry_with_ocr(pdf_path, languages_list)
        
        logger.info(f"Successfully processed {len(processed_elements)} non-empty elements")
        
        if not processed_elements:
            logger.warning(f"No valid elements found with strategy {strategy}, attempting OCR fallback")
            return _retry_with_ocr(pdf_path, languages_list)

        return processed_elements
        
    except Exception as e:
        logger.error(f"Error extracting text from PDF {pdf_path}: {e}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        try:
            logger.info("Attempting OCR as last resort for failed extraction")
            return _retry_with_ocr(pdf_path, ['eng'])
        except Exception as ocr_error:
            logger.error(f"Last-resort OCR extraction failed for {pdf_path}: {ocr_error}")
            return []
'''

print("Complete fixed function:")
print(create_fixed_extract_function())

## Summary of Changes

### Before (Problematic):
```python
# Incorrect paragraph numbering based on element index
if metadata['paragraph_number'] is None:
    metadata['paragraph_number'] = i + 1
```

### After (Fixed):
```python
# Proper paragraph tracking with page-specific and global counters
paragraphs_by_page = {}
global_paragraph_count = 0

# Logic to detect new paragraphs based on content structure
if is_new_paragraph:
    paragraphs_by_page[page_number] += 1
    global_paragraph_count += 1

metadata = {
    'paragraph_number': paragraphs_by_page[page_number],
    'global_paragraph_number': global_paragraph_count,
    'paragraph_id': f"p{page_number}_para{paragraphs_by_page[page_number]}",
    # ... other metadata
}
```

This approach provides:
1. **Accurate paragraph numbering per page**
2. **Global paragraph tracking across the document**
3. **Paragraph IDs that reflect document structure**
4. **Better chunk metadata for downstream processing**

In [None]:
# Also need to fix the OCR retry function with the same logic
def create_fixed_ocr_function():
    """Return the source text of the corrected ``_retry_with_ocr`` helper.

    The function body is returned as a string and is not executed here.
    The generated code refers to names that this notebook does not define
    (``logger``, ``os``, ``partition_pdf``, ``List``/``Dict``/``Any``,
    ``_extract_country_from_filename`` and ``_has_character_corruption``),
    so it must be placed in a module that provides them.

    Returns:
        str: Python source for ``_retry_with_ocr`` using the same per-page
        and global paragraph counters as ``extract_text_from_pdf``.
    """
    # NOTE: page-number lookup failures in the generated code are logged as
    # warnings (matching extract_text_from_pdf) instead of being dropped by
    # a silent ``except Exception: pass``; the page still defaults to 1.
    return '''
def _retry_with_ocr(pdf_path: str, languages_list: list) -> List[Dict[str, Any]]:
    """
    Retry PDF extraction using OCR with proper paragraph tracking.
    """
    try:
        logger.info(f"Attempting OCR extraction for {pdf_path}")
        filename = os.path.basename(pdf_path)
        country = _extract_country_from_filename(pdf_path)
        
        # Force OCR extraction
        elements = partition_pdf(
            filename=pdf_path, 
            strategy="ocr_only",
            extract_images_in_pdf=False,
            infer_table_structure=True,
            languages=languages_list
        )
        
        # Initialize paragraph tracking for OCR
        paragraphs_by_page = {}
        last_element_type = None
        last_page_number = None
        global_paragraph_count = 0
        
        processed_elements = []
        
        for i, element in enumerate(elements):
            text = str(element).strip() if element else ""
            
            # Still check for corruption in OCR results
            if _has_character_corruption(text):
                logger.debug(f"OCR element {i} still has corruption, skipping")
                continue
            
            if len(text) < 10:
                continue

            # Get page number
            page_number = 1
            try:
                if hasattr(element, 'metadata') and element.metadata:
                    if hasattr(element.metadata, 'page_number') and element.metadata.page_number is not None:
                        page_number = element.metadata.page_number
                    elif hasattr(element.metadata, '__dict__'):
                        meta_dict = element.metadata.__dict__
                        if 'page_number' in meta_dict and meta_dict['page_number'] is not None:
                            page_number = meta_dict['page_number']
            except Exception as meta_error:
                logger.warning(f"Error extracting page number from element {i}: {meta_error}")
            
            # Initialize paragraph counter for this page
            if page_number not in paragraphs_by_page:
                paragraphs_by_page[page_number] = 0
                
            # Determine if this is a new paragraph
            element_type = type(element).__name__
            is_new_paragraph = False
            
            if page_number != last_page_number:
                is_new_paragraph = True
            elif element_type != last_element_type:
                is_new_paragraph = True
                
            if is_new_paragraph:
                paragraphs_by_page[page_number] += 1
                global_paragraph_count += 1

            # Create metadata with proper paragraph tracking
            metadata = {
                'element_index': i,
                'element_type': element_type,
                'page_number': page_number,
                'paragraph_number': paragraphs_by_page[page_number],
                'global_paragraph_number': global_paragraph_count,
                'paragraph_id': f"p{page_number}_para{paragraphs_by_page[page_number]}",
                'extraction_strategy': 'ocr_fallback',
                'filename': filename,
                'country': country,
                'document_title': f"{country} NDC"
            }
            
            # Update tracking variables
            last_element_type = element_type
            last_page_number = page_number
            
            processed_elements.append({
                'text': text,
                'metadata': metadata
            })

        logger.info(f"OCR extraction completed with {len(processed_elements)} elements")
        return processed_elements
        
    except Exception as e:
        logger.error(f"OCR extraction failed for {pdf_path}: {e}")
        return []
'''

print("Fixed OCR retry function:")
print(create_fixed_ocr_function())