# BCG Multi-Agent & Multimodal AI Platform - Data Ingestion

This notebook demonstrates the data ingestion process for BCG Sustainability Reports, including:
1. PDF text extraction
2. Visual element detection (charts, graphs)
3. Document processing and structuring

In [None]:
import os
import sys
import logging
from pathlib import Path

# Make project-local packages importable: the project root is taken to be the
# parent of the current working directory.
project_root = Path().resolve().parent
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

# Root logger: INFO and above, each record stamped with time, logger name and level.
LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

## 1. Setup and Configuration

In [None]:
from configs.config import (
    RAW_DATA_DIR,
    PROCESSED_DATA_DIR,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    USE_OCR,
    EXTRACT_CHARTS,
)

# Create the raw and processed data directories up front
# (parents included; nothing happens if they already exist).
for data_dir in (RAW_DATA_DIR, PROCESSED_DATA_DIR):
    data_dir.mkdir(parents=True, exist_ok=True)

# Echo the active ingestion settings, one per line.
print(
    f"Raw data directory: {RAW_DATA_DIR}",
    f"Processed data directory: {PROCESSED_DATA_DIR}",
    f"Text chunk size: {CHUNK_SIZE}, overlap: {CHUNK_OVERLAP}",
    f"OCR enabled: {USE_OCR}",
    f"Chart extraction enabled: {EXTRACT_CHARTS}",
    sep="\n",
)

## 2. Upload or Locate BCG Sustainability Reports

For this demo, the BCG Sustainability Reports (PDF files) must be in the `data/raw` directory. Upload them there if they are missing; the next cell lists the PDFs it finds so you can verify their presence.

In [None]:
# Collect every *.pdf sitting directly in the raw data directory.
pdf_files = list(RAW_DATA_DIR.glob("*.pdf"))

if pdf_files:
    print(f"Found {len(pdf_files)} PDF files:")
    for pdf_file in pdf_files:
        print(f" - {pdf_file.name}")
else:
    # Nothing to ingest yet: tell the reader where the reports belong.
    print("No PDF files found in the raw data directory.")
    print(f"Please upload BCG Sustainability Reports to {RAW_DATA_DIR}")

## 3. Test PDF Text Extraction

In [None]:
from src.data_ingestion.text_extraction.pdf_extractor import PDFTextExtractor

# Initialize the PDF text extractor with the chunking parameters from the config
text_extractor = PDFTextExtractor(
    use_unstructured=True,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Test text extraction on the first PDF (if available)
if pdf_files:
    test_pdf = pdf_files[0]
    print(f"Testing text extraction on {test_pdf.name}")

    # Get metadata
    metadata = text_extractor.get_document_metadata(test_pdf)
    print("\nDocument Metadata:")
    for key, value in metadata.items():
        print(f" - {key}: {value}")

    # Extract and chunk text
    chunks = text_extractor.extract_and_chunk_text(test_pdf)
    print(f"\nExtracted {len(chunks)} text chunks")

    # Display the first chunk, previewing at most 500 characters
    if chunks:
        first_chunk = chunks[0]
        print("\nSample text chunk:")
        # Append the ellipsis only when the preview is actually cut short, so a
        # short chunk is not shown as if it had been truncated.
        print(first_chunk[:500] + "..." if len(first_chunk) > 500 else first_chunk)
else:
    print("No PDF files available for testing text extraction.")

## 4. Test Chart Detection

In [None]:
from src.data_ingestion.visual_extraction.chart_detector import ChartDetector
import matplotlib.pyplot as plt

# Chart detector with the confidence threshold lowered to 0.5 for the demo.
chart_detector = ChartDetector(confidence_threshold=0.5)

if not pdf_files:
    print("No PDF files available for testing chart detection.")
else:
    # Run detection on the first PDF only.
    test_pdf = pdf_files[0]
    print(f"Testing chart detection on {test_pdf.name}")

    visual_elements = chart_detector.detect_charts(test_pdf)
    print(f"\nDetected {len(visual_elements)} potential visual elements")

    # Preview at most three detections, stacked vertically in a single figure.
    max_display = min(3, len(visual_elements))
    if visual_elements:
        plt.figure(figsize=(15, 5 * max_display))
        for row, element in enumerate(visual_elements[:max_display], start=1):
            plt.subplot(max_display, 1, row)
            plt.imshow(element.image)
            plt.title(f"{element.element_type} on page {element.page_num} (confidence: {element.confidence_score:.2f})")
            plt.axis('off')
        plt.tight_layout()
        plt.show()

## 5. Full Document Processing

In [None]:
from src.data_ingestion.document_processor import DocumentProcessor

# Processor settings: shared chunking parameters, visual extraction switched on,
# and the confidence threshold lowered to 0.5 for the demo.
processor_settings = dict(
    use_unstructured=True,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    extract_visuals=True,
    confidence_threshold=0.5,
)
document_processor = DocumentProcessor(**processor_settings)

if not pdf_files:
    print("No PDF files available for document processing.")
else:
    # Run the full pipeline on the first PDF and write results (visuals
    # included) under the processed data directory.
    test_pdf = pdf_files[0]
    print(f"Processing document: {test_pdf.name}")

    processed_doc = document_processor.process_document(
        pdf_path=test_pdf,
        output_dir=PROCESSED_DATA_DIR,
        save_visuals=True,
    )

    # Summarize what came back.
    print(f"\nProcessed document: {processed_doc.document_id}")
    print(f"Text chunks: {len(processed_doc.text_chunks)}")
    print(f"Visual elements: {len(processed_doc.visual_elements)}")
    print("\nMetadata:")
    for key, value in processed_doc.metadata.items():
        print(f" - {key}: {value}")

## 6. Process All BCG Sustainability Reports

In [None]:
# Batch run: hand the whole raw data directory to the processor.
if not pdf_files:
    print("No PDF files available for processing.")
else:
    print(f"Processing all {len(pdf_files)} PDF files...")
    processed_docs = document_processor.process_directory(
        input_dir=RAW_DATA_DIR,
        output_dir=PROCESSED_DATA_DIR,
        file_pattern="*.pdf",
    )

    # One summary line per processed document.
    print(f"\nSuccessfully processed {len(processed_docs)} documents:")
    for doc in processed_docs:
        print(f" - {doc.filename}: {len(doc.text_chunks)} chunks, {len(doc.visual_elements)} visual elements")

## 7. Explore Processed Document Content

In [None]:
# Imports sit at the top of the cell so they always run, instead of being
# buried inside a branch that is only taken for some documents.
import json
import random

import matplotlib.pyplot as plt
from PIL import Image

# Explore the content of processed documents (if available)
processed_dirs = [d for d in PROCESSED_DATA_DIR.iterdir() if d.is_dir()]

if processed_dirs:
    # Select a random processed document
    sample_doc_dir = random.choice(processed_dirs)
    json_files = list(sample_doc_dir.glob("*.json"))

    if json_files:
        # Only the first JSON file found in the directory is inspected
        with open(json_files[0], "r", encoding="utf-8") as f:
            doc_data = json.load(f)

        print(f"Exploring document: {doc_data['filename']}")
        print(f"Document ID: {doc_data['document_id']}")
        print(f"Total text chunks: {len(doc_data['text_chunks'])}")
        print(f"Total visual elements: {len(doc_data['visual_elements'])}")

        # Display a random text chunk, previewing at most 500 characters
        if doc_data['text_chunks']:
            sample_chunk = random.choice(doc_data['text_chunks'])
            print("\nSample text chunk:")
            print(sample_chunk[:500] + "..." if len(sample_chunk) > 500 else sample_chunk)

        # Display information about visual elements
        if doc_data['visual_elements']:
            print("\nVisual elements:")
            for i, element in enumerate(doc_data['visual_elements']):
                print(f" - Element {i+1}: {element['element_type']} on page {element['page_num']} (confidence: {element['confidence_score']:.2f})")

            # Load and display a sample visual element if available
            visuals_dir = sample_doc_dir / "visuals"
            if visuals_dir.exists():
                visual_files = list(visuals_dir.glob("*.png"))
                if visual_files:
                    sample_visual = random.choice(visual_files)
                    print(f"\nDisplaying sample visual element: {sample_visual.name}")

                    plt.figure(figsize=(10, 8))
                    # Open the image in a context manager so its file handle is
                    # released once matplotlib has been given the pixel data.
                    with Image.open(sample_visual) as visual_image:
                        plt.imshow(visual_image)
                    plt.axis('off')
                    plt.title(sample_visual.name)
                    plt.show()
    else:
        # Previously this case printed nothing, leaving the cell output empty.
        print(f"No JSON file found in {sample_doc_dir}")
else:
    print("No processed documents available for exploration.")

## 8. Analyzing Extracted Content

In [None]:
# `json` is imported here explicitly: it is used below, and without this import
# the cell only worked if an earlier cell's conditional import had happened to run.
import json
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd

# Analyze the content of all processed documents (if available)
processed_dirs = [d for d in PROCESSED_DATA_DIR.iterdir() if d.is_dir()]

if processed_dirs:
    # Collect data from all processed documents
    all_chunks = []
    doc_info = []

    for doc_dir in processed_dirs:
        for json_file in doc_dir.glob("*.json"):
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    doc_data = json.load(f)

                # Add document info
                doc_info.append({
                    "document_id": doc_data["document_id"],
                    "filename": doc_data["filename"],
                    "chunks": len(doc_data["text_chunks"]),
                    "visuals": len(doc_data["visual_elements"]),
                    "pages": doc_data["metadata"].get("page_count", "N/A"),
                })

                # Add text chunks, one record per chunk, tagged with its document
                all_chunks.extend(
                    {"document_id": doc_data["document_id"], "text": chunk}
                    for chunk in doc_data["text_chunks"]
                )
            except Exception as e:
                # Best effort: report the unreadable/malformed file and keep going
                print(f"Error processing {json_file}: {str(e)}")

    # Create dataframes
    doc_df = pd.DataFrame(doc_info)
    chunks_df = pd.DataFrame(all_chunks)

    # Display document summary
    print("Document Summary:")
    print(doc_df)

    # Analyze text content
    if not chunks_df.empty:
        # Key sustainability terms to look for
        sustainability_terms = [
            "sustainability", "climate", "carbon", "emission", "green",
            "renewable", "environment", "social", "governance", "esg",
            "net zero", "diversity", "inclusion", "ethical", "responsibility"
        ]

        # Count occurrences of each term across all chunks. The trailing \w*
        # also counts longer words that start with the term (e.g. "emissions").
        term_counts = {}
        for term in sustainability_terms:
            # Pass the pattern as a string plus flags, which is the documented
            # form for Series.str.count, rather than a pre-compiled regex.
            pattern = r'\b' + re.escape(term) + r'\w*\b'
            per_chunk_counts = chunks_df["text"].str.count(pattern, flags=re.IGNORECASE)
            term_counts[term] = int(per_chunk_counts.sum())

        # Plot term frequencies
        plt.figure(figsize=(12, 6))
        pd.Series(term_counts).sort_values(ascending=False).plot(kind="bar")
        plt.title("Frequency of Key Sustainability Terms")
        plt.xlabel("Term")
        plt.ylabel("Frequency")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        plt.show()
else:
    print("No processed documents available for analysis.")

## 9. Conclusion

In this notebook, we've successfully demonstrated the data ingestion pipeline for the BCG Multi-Agent & Multimodal AI Platform. We've processed BCG Sustainability Reports, extracting both textual content and visual elements, and structured the data for further use in our RAG system.

Next steps include:
1. Creating embeddings for the extracted text chunks
2. Building the vector database for retrieval
3. Implementing the RAG components
4. Developing the multi-agent system