# PDF Ingestion Pipeline with FAISS Vector Database

This notebook processes PDF files from the `annual_reports` folder and creates a persistent FAISS index with support for both local models and HuggingFace inference endpoints.

## ⚠️ Quick Start for Kernel Crash Issues

If you're experiencing kernel crashes:
1. First run the NumPy downgrade: `%pip install 'numpy<2' --force-reinstall`, then restart the kernel so the downgraded version is the one that gets imported
2. Set environment variables (see cell below)
3. Use `use_safe_mode=True` when running the pipeline
4. Reduce batch size or use the debug cells to identify issues

In [1]:
# IMPORTANT: Run this cell FIRST if experiencing crashes
import os

# Pin the numeric backends to a single thread and turn off tokenizer
# parallelism in one go.
os.environ.update({
    'OMP_NUM_THREADS': '1',
    'MKL_NUM_THREADS': '1',
    'OPENBLAS_NUM_THREADS': '1',
    'TOKENIZERS_PARALLELISM': 'false',
})

print("✅ Environment configured for stability")

✅ Environment configured for stability


## 1. Install Required Dependencies

**Important**: If you encounter NumPy compatibility errors, run the NumPy downgrade command first!

In [2]:
# Check current NumPy version
# (the install hints in this notebook all pin 'numpy<2', so a 1.x version is expected here)
import numpy as np
print(f"Current NumPy version: {np.__version__}")

# If you see version 2.x and get compatibility errors, uncomment the next line,
# run it, and then restart the kernel so the downgraded NumPy is the one imported:
# %pip install 'numpy<2' --force-reinstall

Current NumPy version: 1.26.4


In [3]:
# Option 1: Quick fix for NumPy compatibility (Recommended)
# !pip install 'numpy<2' --force-reinstall

# Option 2: Complete installation with compatible versions
# !pip install 'numpy<2' faiss-cpu==1.7.4 'sentence-transformers>=2.2.0' 'transformers>=4.30.0' 'langchain>=0.0.200' PyPDF2 pdfplumber huggingface_hub python-dotenv tiktoken

# Option 3: If you still have issues, create a fresh environment:
# python -m venv pdf_rag_env
# source pdf_rag_env/bin/activate  # On Windows: pdf_rag_env\\Scripts\\activate
# pip install 'numpy<2' faiss-cpu sentence-transformers langchain PyPDF2 pdfplumber huggingface_hub

## 2. Environment Check and Import Libraries

In [4]:
# Quick environment check
# Prints the interpreter version and sys.prefix (the interpreter's install
# prefix; it points at the virtual environment only when the kernel runs in one).
import sys
print(f"Python version: {sys.version}")
print(f"Virtual environment: {sys.prefix}")

# Test critical imports
# Only a failed import is reported here; a NumPy that imports but is
# incompatible with other packages is not detected by this check.
try:
    import numpy as np
    print(f"✅ NumPy {np.__version__}")
except ImportError as e:
    print(f"❌ NumPy import failed: {e}")

Python version: 3.11.4 (main, Jul 25 2023, 17:07:07) [Clang 14.0.3 (clang-1403.0.22.14.1)]
Virtual environment: /Users/raamraam/outskill/GenAIEngineering-Cohort1/crewai/crewai_env
✅ NumPy 1.26.4


In [5]:
import os
import json
import pickle
from typing import List, Dict, Optional, Union
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# PDF processing
import PyPDF2
import pdfplumber

# Text processing
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

# Embeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
from huggingface_hub import InferenceClient

# Vector store
import faiss
import numpy as np

# Utils
from datetime import datetime
from tqdm import tqdm
import hashlib

print("✅ Libraries imported successfully")

✅ Libraries imported successfully


## 3. Configuration

⚠️ **Memory Warning**: If processing large PDFs (>10MB) or many files, consider:
- Reducing `CHUNK_SIZE` to 500
- Processing files in batches
- Using `use_safe_mode=True` in the pipeline

In [6]:
class Config:
    """Configuration for the ingestion pipeline

    All values are class attributes, so they are evaluated once when this cell
    runs (including the HF_TOKEN environment lookup).
    """
    # Paths
    PDF_FOLDER = "annual_reports"      # folder scanned for *.pdf files
    FAISS_INDEX_PATH = "faiss_index"   # directory holding index.faiss, chunks.pkl, metadata.json
    # NOTE(review): METADATA_PATH and CHUNKS_PATH are not referenced by the cells
    # visible in this notebook; FAISSIndexManager writes its own metadata.json /
    # chunks.pkl inside FAISS_INDEX_PATH. Confirm whether they are still needed.
    METADATA_PATH = "document_metadata.json"
    CHUNKS_PATH = "document_chunks.pkl"
    
    # Chunking parameters (measured in characters: chunk_text uses len as length function)
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200
    
    # Embedding models
    DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"   # used by the 'local' backend
    HF_API_EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"   # used by the 'hf_api' backend
    
    # HuggingFace API (set your token as environment variable HF_TOKEN)
    # None when the variable is not set; the 'hf_api' backend refuses to start without it.
    HF_TOKEN = os.getenv("HF_TOKEN")
    
    # FAISS parameters
    # 384 matches the local MiniLM model; the hf_api backend in this notebook
    # assumes 768, so the two backends cannot share one index.
    EMBEDDING_DIM = 384  # Adjust based on your model
    
config = Config()

# Create necessary directories
os.makedirs(config.PDF_FOLDER, exist_ok=True)
print(f"📁 Configuration loaded. PDF folder: {config.PDF_FOLDER}")

📁 Configuration loaded. PDF folder: annual_reports


## 4. PDF Extraction Functions

In [7]:
def extract_text_pypdf2(pdf_path: str) -> str:
    """Extract text from PDF using PyPDF2.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, each followed by a newline. If reading fails
        part-way through, the error is printed and the text gathered so far
        is returned (best effort), so the result may be partial or empty.
    """
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                # extract_text() may yield None for pages without a text layer
                # (e.g. scanned images). Treat that as empty text instead of
                # letting `None + "\n"` raise and discard every remaining page.
                text += (page.extract_text() or "") + "\n"
    except Exception as e:
        print(f"Error with PyPDF2: {e}")
    return text

def extract_text_pdfplumber(pdf_path: str) -> str:
    """Pull the text out of a PDF with pdfplumber (better for tables).

    Pages that produce no text are skipped. Any failure is printed and the
    text collected up to that point is returned.
    """
    pieces = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for current_page in document.pages:
                extracted = current_page.extract_text()
                if not extracted:
                    continue
                pieces.append(extracted + "\n")
    except Exception as e:
        print(f"Error with pdfplumber: {e}")
    return "".join(pieces)

def extract_pdf_text(pdf_path: str, method: str = "pdfplumber") -> Dict:
    """Read a PDF and return its text together with bookkeeping metadata.

    ``method == "pypdf2"`` selects the PyPDF2 extractor; any other value uses
    pdfplumber. The document ID is the MD5 hex digest of the path string, so
    it identifies the path rather than the file contents.
    """
    extractor = extract_text_pypdf2 if method == "pypdf2" else extract_text_pdfplumber
    extracted_text = extractor(pdf_path)

    # Generate document ID
    path_digest = hashlib.md5(pdf_path.encode()).hexdigest()

    record = {}
    record["doc_id"] = path_digest
    record["path"] = pdf_path
    record["filename"] = os.path.basename(pdf_path)
    record["text"] = extracted_text
    record["extraction_method"] = method
    record["extraction_date"] = datetime.now().isoformat()
    return record

print("✅ PDF extraction functions defined")

✅ PDF extraction functions defined


## 5. Text Chunking

In [8]:
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    """Break ``text`` into overlapping pieces with RecursiveCharacterTextSplitter.

    Sizes are counted in characters (``len``). The splitter is given the
    separators paragraph break, line break, full stop, space, and finally the
    empty string, in that order.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

def create_document_chunks(documents: List[Dict]) -> List[Dict]:
    """Split every document into chunks and attach per-chunk metadata.

    Each returned dict carries the chunk ID (``<doc_id>_<index>``), the parent
    document's ID and filename, the chunk's position in that document, the
    chunk text and its character count. Chunk sizes come from ``config``.
    """
    def _describe(document: Dict, position: int, piece: str) -> Dict:
        # One metadata record per chunk, keys in a fixed order
        return {
            "chunk_id": f"{document['doc_id']}_{position}",
            "doc_id": document["doc_id"],
            "filename": document["filename"],
            "chunk_index": position,
            "text": piece,
            "char_count": len(piece)
        }

    collected = []
    for document in documents:
        pieces = chunk_text(
            document["text"],
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP
        )
        collected.extend(
            _describe(document, position, piece)
            for position, piece in enumerate(pieces)
        )
    return collected

print("✅ Text chunking functions defined")

✅ Text chunking functions defined


## 5.5. Debug Embedding Issues (Optional)

If you're experiencing kernel crashes during embedding generation, run these debug cells:

In [9]:
# Debug cell - Check system resources
try:
    import psutil
except ImportError:
    print("Installing psutil for system monitoring...")
    !pip install psutil
    import psutil

import torch

# Check available memory
memory = psutil.virtual_memory()
print(f"Available memory: {memory.available / 1024**3:.2f} GB")
print(f"Total memory: {memory.total / 1024**3:.2f} GB")
print(f"Memory usage: {memory.percent}%")

# Check PyTorch and CUDA
print(f"\nPyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

Available memory: 0.67 GB
Total memory: 16.00 GB
Memory usage: 95.8%

PyTorch version: 2.2.2
CUDA available: False


In [10]:
# Alternative: Simple embedding generator for debugging
def test_embedding_generation():
    """Test embedding generation with a small sample"""
    from sentence_transformers import SentenceTransformer
    import torch
    
    # Force CPU usage
    device = 'cpu'
    torch.set_num_threads(1)  # Limit threads to avoid conflicts
    
    # Test with a tiny model first
    test_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    
    # Test with small text
    test_texts = ["This is a test sentence.", "Another test."]
    
    try:
        embeddings = test_model.encode(test_texts, batch_size=1)
        print(f"✅ Test successful! Embeddings shape: {embeddings.shape}")
        return True
    except Exception as e:
        print(f"❌ Test failed: {e}")
        return False

# Run the test
test_embedding_generation()

✅ Test successful! Embeddings shape: (2, 384)


True

In [11]:
# Alternative embedding function with maximum stability
def generate_embeddings_safe(texts: List[str], model_name: str = 'all-MiniLM-L6-v2'):
    """Generate embeddings with maximum stability settings.

    Runs the model on CPU with a single torch thread and encodes the texts in
    very small batches, collecting garbage periodically.

    Args:
        texts: Texts to embed.
        model_name: SentenceTransformer model to load.

    Returns:
        A float32 array of shape (len(texts), embedding_dim). For empty input
        the array has shape (0, embedding_dim), so callers that read
        ``shape[1]`` keep working.

    Side effects:
        torch's thread count is set to 1 and gradient tracking is disabled
        process-wide; both stay that way after the call.
    """
    import torch
    from sentence_transformers import SentenceTransformer
    import gc
    
    # Configure for stability
    torch.set_num_threads(1)
    torch.set_grad_enabled(False)
    
    # Load model on CPU
    model = SentenceTransformer(model_name, device='cpu')
    model.eval()
    
    embeddings = []
    batch_size = 4  # Very small batch size
    
    print(f"Processing {len(texts)} texts in batches of {batch_size}...")
    
    if len(texts) == 0:
        # np.array([]) would be 1-D with shape (0,); return a well-formed
        # empty matrix instead.
        return np.empty((0, model.get_sentence_embedding_dimension()), dtype=np.float32)
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        
        # Process batch
        with torch.no_grad():
            batch_embeddings = model.encode(
                batch,
                batch_size=batch_size,
                show_progress_bar=False,
                convert_to_numpy=True,
                normalize_embeddings=False
            )
        
        embeddings.extend(batch_embeddings)
        
        # Aggressive memory cleanup (i advances by batch_size, so this fires every 5th batch)
        if i % 20 == 0:
            gc.collect()
    
    # FAISS only accepts float32, so pin the dtype here
    return np.asarray(embeddings, dtype=np.float32)

print("✅ Safe embedding function defined")

✅ Safe embedding function defined


In [12]:
class EmbeddingGenerator:
    """Turn texts into embedding vectors with a local model or the HuggingFace API.

    Attributes:
        method: Selected backend, 'local' or 'hf_api'.
        model_name: Name of the model in use.
        embedding_dim: Width of the vectors this generator produces.
    """

    # Backends accepted by ``method``
    SUPPORTED_METHODS = ("local", "hf_api")

    def __init__(self, method: str = "local", model_name: str = None):
        """
        Initialize embedding generator
        Args:
            method: 'local' or 'hf_api'
            model_name: Model to use for embeddings. Only honoured by the
                'local' backend; 'hf_api' always uses
                config.HF_API_EMBEDDING_MODEL because the embedding dimension
                (768) is hard-coded for that model.
        Raises:
            ValueError: If ``method`` is not supported, or if 'hf_api' is
                requested without an HF_TOKEN.
        """
        # Fail fast: an unknown method used to build an object with no backend
        # whose generate_embeddings() silently returned None.
        if method not in self.SUPPORTED_METHODS:
            raise ValueError(
                f"Unknown embedding method {method!r}; expected one of {self.SUPPORTED_METHODS}"
            )

        self.method = method
        self.model_name = model_name or config.DEFAULT_EMBEDDING_MODEL
        
        if method == "local":
            print(f"Loading local model: {self.model_name}")
            # Set device to CPU to avoid CUDA issues
            import torch
            device = 'cpu'  # Force CPU to avoid GPU memory issues
            print(f"Using device: {device}")
            
            self.model = SentenceTransformer(self.model_name, device=device)
            self.embedding_dim = self.model.get_sentence_embedding_dimension()
            
            # Disable gradient computation to save memory
            # (set_grad_enabled is process-wide and stays off after construction)
            self.model.eval()
            torch.set_grad_enabled(False)
            
        elif method == "hf_api":
            if not config.HF_TOKEN:
                raise ValueError("HF_TOKEN not found. Set it as environment variable.")
            self.client = InferenceClient(token=config.HF_TOKEN)
            self.model_name = config.HF_API_EMBEDDING_MODEL
            # You'll need to know the embedding dimension for your API model
            self.embedding_dim = 768  # for all-mpnet-base-v2
    
    def generate_embeddings(self, texts: List[str], batch_size: int = 8) -> np.ndarray:
        """Generate embeddings for a list of texts.

        Args:
            texts: Texts to embed.
            batch_size: Batch size for the local backend (ignored by 'hf_api').

        Returns:
            float32 array of shape (len(texts), embedding_dim); empty input
            yields shape (0, embedding_dim).

        Raises:
            ValueError: If ``self.method`` is not a supported backend.
        """
        if self.method not in self.SUPPORTED_METHODS:
            raise ValueError(
                f"Unknown embedding method {self.method!r}; expected one of {self.SUPPORTED_METHODS}"
            )
        if len(texts) == 0:
            # Keep the result 2-D so callers can rely on shape[1]
            return np.empty((0, self.embedding_dim), dtype=np.float32)
        if self.method == "local":
            return self._generate_local_embeddings(texts, batch_size)
        return self._generate_api_embeddings(texts)
    
    def _generate_local_embeddings(self, texts: List[str], batch_size: int) -> np.ndarray:
        """Generate embeddings using local model with memory management.

        A batch that fails to encode is replaced by zero vectors (best
        effort); the number of affected texts is reported at the end.
        """
        embeddings = []
        failed_texts = 0
        
        # Process in smaller batches to avoid memory issues
        for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
            batch = texts[i:i + batch_size]
            
            try:
                # Generate embeddings with error handling
                batch_embeddings = self.model.encode(
                    batch, 
                    convert_to_numpy=True,
                    show_progress_bar=False,  # Disable nested progress bar
                    batch_size=batch_size
                )
                embeddings.extend(batch_embeddings)
                
                # Clear memory periodically
                if i % (batch_size * 10) == 0:
                    import gc
                    gc.collect()
                    
            except Exception as e:
                print(f"Error in batch {i//batch_size}: {e}")
                # Create zero embeddings for failed batch. float32 on purpose:
                # float64 zeros would upcast the whole matrix, which FAISS rejects.
                failed_texts += len(batch)
                for _ in batch:
                    embeddings.append(np.zeros(self.embedding_dim, dtype=np.float32))
        
        if failed_texts:
            print(f"⚠️  {failed_texts} of {len(texts)} texts could not be embedded and were replaced by zero vectors")
        
        return np.asarray(embeddings, dtype=np.float32)
    
    def _generate_api_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embeddings using HuggingFace API.

        One request is sent per text. A failed request is replaced by a zero
        vector (best effort); the number of failures is reported at the end.
        """
        embeddings = []
        failed_texts = 0
        
        for text in tqdm(texts, desc="Generating embeddings via API"):
            try:
                # Using feature extraction endpoint
                embedding = self.client.feature_extraction(
                    text,
                    model=self.model_name
                )
                embeddings.append(np.asarray(embedding, dtype=np.float32))
            except Exception as e:
                print(f"Error generating embedding: {e}")
                # Fallback to zero vector (float32 to match the successful rows)
                failed_texts += 1
                embeddings.append(np.zeros(self.embedding_dim, dtype=np.float32))
        
        if failed_texts:
            print(f"⚠️  {failed_texts} of {len(texts)} texts could not be embedded and were replaced by zero vectors")
        
        return np.asarray(embeddings, dtype=np.float32)

print("✅ Embedding generator class defined with memory optimization")

✅ Embedding generator class defined with memory optimization


## 7. FAISS Index Creation and Management

In [13]:
class FAISSIndexManager:
    """Build, persist, reload and query a flat FAISS inner-product index.

    Vectors are L2-normalised before insertion and before querying, so the
    inner-product scores are cosine similarities.

    Attributes:
        embedding_dim: Width of the indexed vectors.
        index: The FAISS index, or None until created/loaded.
        chunks: Chunk dicts, positionally aligned with the index rows.
        metadata: Summary information stored next to the index.
    """

    def __init__(self, embedding_dim: int):
        self.embedding_dim = embedding_dim
        self.index = None
        self.chunks = []
        self.metadata = {}
    
    def create_index(self, embeddings: np.ndarray, chunks: List[Dict]):
        """Create FAISS index from embeddings.

        Args:
            embeddings: Array of shape (len(chunks), embedding_dim). A
                C-contiguous float32 array is normalised in place; any other
                input is converted to a float32 copy first.
            chunks: One metadata dict per embedding row, each with a
                "filename" key.

        Raises:
            ValueError: If the embeddings are not a 2-D array of the expected
                width, or their row count differs from ``len(chunks)``.
        """
        # FAISS requires C-contiguous float32 input
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
        if vectors.ndim != 2 or vectors.shape[1] != self.embedding_dim:
            raise ValueError(
                f"Expected embeddings of shape (n, {self.embedding_dim}), got {vectors.shape}"
            )
        if vectors.shape[0] != len(chunks):
            raise ValueError(
                f"Got {vectors.shape[0]} embeddings for {len(chunks)} chunks; they must match one-to-one"
            )
        
        # Normalize embeddings for cosine similarity
        faiss.normalize_L2(vectors)
        
        # Create FAISS index
        self.index = faiss.IndexFlatIP(self.embedding_dim)  # Inner product for cosine similarity
        self.index.add(vectors)
        
        self.chunks = chunks
        self.metadata = {
            "total_chunks": len(chunks),
            "embedding_dim": self.embedding_dim,
            "creation_date": datetime.now().isoformat(),
            # sorted() keeps the stored document list stable between runs
            "documents": sorted(set(chunk["filename"] for chunk in chunks))
        }
        
        print(f"✅ Created FAISS index with {len(chunks)} chunks")
    
    def save_index(self, base_path: str = None):
        """Save FAISS index and associated data.

        Writes index.faiss, chunks.pkl and metadata.json into ``base_path``
        (default: config.FAISS_INDEX_PATH), creating the directory if needed.

        Raises:
            RuntimeError: If no index has been created or loaded yet.
        """
        if self.index is None:
            raise RuntimeError("No index to save: call create_index() or load_index() first")
        
        base_path = base_path or config.FAISS_INDEX_PATH
        os.makedirs(base_path, exist_ok=True)
        
        # Save FAISS index
        faiss.write_index(self.index, os.path.join(base_path, "index.faiss"))
        
        # Save chunks
        with open(os.path.join(base_path, "chunks.pkl"), "wb") as f:
            pickle.dump(self.chunks, f)
        
        # Save metadata
        with open(os.path.join(base_path, "metadata.json"), "w") as f:
            json.dump(self.metadata, f, indent=2)
        
        print(f"💾 Saved index to {base_path}")
    
    def load_index(self, base_path: str = None):
        """Load FAISS index and associated data.

        Reads the three files written by save_index() from ``base_path``
        (default: config.FAISS_INDEX_PATH) and adopts the loaded index's
        dimension as ``embedding_dim``.
        """
        base_path = base_path or config.FAISS_INDEX_PATH
        
        # Load FAISS index
        self.index = faiss.read_index(os.path.join(base_path, "index.faiss"))
        # Trust the stored index over whatever dimension the constructor was given
        self.embedding_dim = self.index.d
        
        # Load chunks
        # SECURITY: pickle.load can execute arbitrary code; only load index
        # folders that this notebook (or another trusted source) produced.
        with open(os.path.join(base_path, "chunks.pkl"), "rb") as f:
            self.chunks = pickle.load(f)
        
        # Load metadata
        with open(os.path.join(base_path, "metadata.json"), "r") as f:
            self.metadata = json.load(f)
        
        print(f"📂 Loaded index from {base_path}")
        print(f"   Total chunks: {self.metadata['total_chunks']}")
    
    def search(self, query_embedding: np.ndarray, k: int = 5) -> List[Dict]:
        """Search for similar chunks.

        Args:
            query_embedding: A single query vector; it is not modified.
            k: Maximum number of results.

        Returns:
            Up to ``k`` dicts with "rank" (1-based), "score" (cosine
            similarity) and "chunk", best match first. Fewer than ``k``
            entries are returned when the index holds fewer vectors.

        Raises:
            RuntimeError: If no index has been created or loaded yet.
        """
        if self.index is None:
            raise RuntimeError("No index available: call create_index() or load_index() first")
        
        # Work on a private float32 copy: normalize_L2 operates in place and
        # must not silently rescale the caller's array.
        query = np.array(query_embedding, dtype=np.float32, order="C").reshape(1, -1)
        faiss.normalize_L2(query)
        
        # Search
        distances, indices = self.index.search(query, k)
        
        # Prepare results
        results = []
        for i, (dist, idx) in enumerate(zip(distances[0], indices[0])):
            # FAISS pads missing results with label -1; without the lower
            # bound, -1 would wrap around to the last chunk.
            if 0 <= idx < len(self.chunks):
                result = {
                    "rank": i + 1,
                    "score": float(dist),
                    "chunk": self.chunks[idx]
                }
                results.append(result)
        
        return results

print("✅ FAISS index manager class defined")

✅ FAISS index manager class defined


## 8. Main Pipeline Function

In [14]:
def run_ingestion_pipeline(
    pdf_folder: str = None,
    embedding_method: str = "local",
    save_path: str = None,
    use_safe_mode: bool = False
):
    """
    Run the complete ingestion pipeline
    Args:
        pdf_folder: Folder containing PDF files (default: config.PDF_FOLDER)
        embedding_method: 'local' or 'hf_api' (ignored in safe mode, which
            always embeds with the local default model)
        save_path: Path to save the FAISS index (default: config.FAISS_INDEX_PATH)
        use_safe_mode: Use safer embedding generation (slower but more stable)
    Returns:
        The FAISSIndexManager holding the freshly built and saved index.
    Raises:
        ValueError: If the folder contains no PDF files, or if no text chunks
            could be produced from them.
    """
    pdf_folder = pdf_folder or config.PDF_FOLDER
    save_path = save_path or config.FAISS_INDEX_PATH
    
    print(f"🚀 Starting ingestion pipeline...")
    print(f"   PDF folder: {pdf_folder}")
    print(f"   Embedding method: {embedding_method}")
    print(f"   Safe mode: {use_safe_mode}")
    if use_safe_mode and embedding_method != "local":
        # Safe mode bypasses EmbeddingGenerator entirely; say so instead of
        # silently ignoring the requested backend.
        print(f"   ⚠️  Safe mode always uses the local model; embedding_method='{embedding_method}' is ignored")
    
    # Step 1: Extract text from PDFs
    print("\n📄 Step 1: Extracting text from PDFs...")
    documents = []
    
    # glob() order is filesystem-dependent; sort so chunk order (and therefore
    # index row order) is reproducible between runs.
    pdf_files = sorted(Path(pdf_folder).glob("*.pdf"))
    if not pdf_files:
        raise ValueError(f"No PDF files found in {pdf_folder}")
    
    for pdf_path in pdf_files:
        print(f"   Processing: {pdf_path.name}")
        doc_data = extract_pdf_text(str(pdf_path))
        if not doc_data["text"].strip():
            print(f"   ⚠️  No text extracted from {pdf_path.name} (scanned or protected PDF?)")
        documents.append(doc_data)
    
    # Step 2: Create chunks
    print("\n✂️  Step 2: Creating document chunks...")
    chunks = create_document_chunks(documents)
    print(f"   Created {len(chunks)} chunks from {len(documents)} documents")
    if not chunks:
        # Without this guard the run fails later with an opaque IndexError on
        # embeddings.shape[1].
        raise ValueError(
            f"No text chunks could be created from the PDFs in {pdf_folder}; nothing to index"
        )
    
    # Step 3: Generate embeddings
    print("\n🧮 Step 3: Generating embeddings...")
    chunk_texts = [chunk["text"] for chunk in chunks]
    
    if use_safe_mode:
        print("   Using safe mode for embedding generation...")
        embeddings = generate_embeddings_safe(chunk_texts)
    else:
        embedding_gen = EmbeddingGenerator(method=embedding_method)
        embeddings = embedding_gen.generate_embeddings(chunk_texts, batch_size=8)
    
    print(f"   Generated embeddings with shape: {embeddings.shape}")
    
    # Step 4: Create and save FAISS index
    print("\n🗂️  Step 4: Creating FAISS index...")
    index_manager = FAISSIndexManager(embedding_dim=embeddings.shape[1])
    index_manager.create_index(embeddings, chunks)
    
    # Step 5: Save everything
    print("\n💾 Step 5: Saving index and metadata...")
    index_manager.save_index(save_path)
    
    print("\n✅ Pipeline completed successfully!")
    return index_manager

## 9. Run the Pipeline

Make sure you have PDF files in the `annual_reports` folder before running this cell.

In [15]:
# Option 1: Run with normal mode (may crash on some systems)
# index_manager = run_ingestion_pipeline(
#     pdf_folder="annual_reports",
#     embedding_method="local",
#     save_path="faiss_index"
# )

# Option 2: Run with safe mode (recommended if kernel crashes)
index_manager = run_ingestion_pipeline(
    pdf_folder="annual_reports",
    embedding_method="local",
    save_path="faiss_index",
    use_safe_mode=True  # Enable safe mode
)

🚀 Starting ingestion pipeline...
   PDF folder: annual_reports
   Embedding method: local
   Safe mode: True

📄 Step 1: Extracting text from PDFs...
   Processing: NASDAQ_MSFT_2023.pdf
   Processing: NASDAQ_AAPL_2023.pdf

✂️  Step 2: Creating document chunks...
   Created 682 chunks from 2 documents

🧮 Step 3: Generating embeddings...
   Using safe mode for embedding generation...
Processing 682 texts in batches of 4...


100%|██████████| 171/171 [02:25<00:00,  1.18it/s]

   Generated embeddings with shape: (682, 384)

🗂️  Step 4: Creating FAISS index...
✅ Created FAISS index with 682 chunks

💾 Step 5: Saving index and metadata...
💾 Saved index to faiss_index

✅ Pipeline completed successfully!





## 9.5. Troubleshooting Kernel Crashes

If you're still experiencing kernel crashes, try these solutions:

In [16]:
# Solution 1: Install/reinstall critical packages with specific versions
# (restart the kernel afterwards so the reinstalled packages are the ones imported)
# !pip install --upgrade --force-reinstall 'numpy<2' torch==2.0.1 sentence-transformers==2.2.2

# Solution 2: Set environment variables before importing
# NOTE(review): by the time this cell runs, earlier cells have already imported
# numpy/torch/faiss. These variables are presumably read when those libraries
# initialise, so setting them here likely only helps after a kernel restart
# with this cell (or the first cell of the notebook) executed first — confirm.
import os
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Solution 3: Clear all variables and restart
# %reset -f
# Then restart kernel and run cells again

# Solution 4: As a last resort, restart Jupyter completely
# Close this notebook, restart Jupyter server, and try again

print("Environment variables set for stability")

Environment variables set for stability


In [17]:
# Alternative: Process PDFs one at a time to isolate issues
def process_single_pdf(pdf_path: str, model_name: str = 'all-MiniLM-L6-v2'):
    """Run extraction, chunking and safe embedding for one PDF (debug helper).

    Progress is printed after each stage. Returns the tuple
    ``(embeddings, chunks)``; nothing is written to disk.
    """
    print(f"Processing: {pdf_path}")

    # Stage 1: text extraction
    document = extract_pdf_text(pdf_path)
    print(f"  Extracted {len(document['text'])} characters")

    # Stage 2: chunking (the helper expects a list of documents)
    chunks = create_document_chunks([document])
    print(f"  Created {len(chunks)} chunks")

    # Stage 3: embeddings via the stability-first code path
    embeddings = generate_embeddings_safe(
        [entry["text"] for entry in chunks],
        model_name,
    )
    print(f"  Generated embeddings: {embeddings.shape}")

    return embeddings, chunks

# Test with a single PDF
# pdf_files = list(Path("annual_reports").glob("*.pdf"))
# if pdf_files:
#     embeddings, chunks = process_single_pdf(str(pdf_files[0]))

In [18]:
def quick_search(query: str, k: int = 5, embedding_method: str = "local"):
    """Quick search function after index is created.

    Loads the saved index from config.FAISS_INDEX_PATH, embeds ``query`` with
    the chosen backend, prints the best matches and returns them. The index
    and the embedding model are reloaded on every call, so this is meant for
    interactive spot checks rather than bulk querying.

    Args:
        query: Free-text search query.
        k: Maximum number of results.
        embedding_method: 'local' or 'hf_api'; must be the backend the index
            was built with.

    Returns:
        The list of result dicts produced by FAISSIndexManager.search().

    Raises:
        ValueError: If the query embedding's dimension differs from the
            index's (e.g. an 'hf_api' query against an index built locally).
    """
    # Load existing index (paths and dimension come from the shared config
    # instead of repeating the literals here)
    index_manager = FAISSIndexManager(embedding_dim=config.EMBEDDING_DIM)
    index_manager.load_index(config.FAISS_INDEX_PATH)
    
    # Generate query embedding
    embedding_gen = EmbeddingGenerator(method=embedding_method)
    query_embedding = embedding_gen.generate_embeddings([query])
    
    # The index only accepts vectors of its own width; fail with a readable
    # message rather than an assertion from inside FAISS.
    query_dim = query_embedding.shape[-1]
    index_dim = index_manager.index.d
    if query_dim != index_dim:
        raise ValueError(
            f"Query embedding has dimension {query_dim} but the index was built with "
            f"dimension {index_dim}; use the embedding method/model the index was created with"
        )
    
    # Search
    results = index_manager.search(query_embedding[0], k=k)
    
    # Display results
    print(f"\n🔍 Query: {query}")
    print(f"📊 Top {k} results:")
    for result in results:
        print(f"\n[Rank {result['rank']}] Score: {result['score']:.4f}")
        print(f"📄 Document: {result['chunk']['filename']}")
        print(f"📝 Text: {result['chunk']['text'][:200]}...")
    
    return results

## 11. Example Search

In [19]:
# Example search - modify the query based on your documents
results = quick_search("revenue growth", k=3)

📂 Loaded index from faiss_index
   Total chunks: 682
Loading local model: sentence-transformers/all-MiniLM-L6-v2
Using device: cpu


Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.93it/s]


🔍 Query: revenue growth
📊 Top 3 results:

[Rank 1] Score: 0.5754
📄 Document: NASDAQ_MSFT_2023.pdf
📝 Text: and fourth quarter revenue is driven by the volume of multi-year on-premises contracts executed during the period.
Change in Accounting Estimate
In July 2022, we completed an assessment of the useful ...

[Rank 2] Score: 0.5710
📄 Document: NASDAQ_MSFT_2023.pdf
📝 Text: SEGMENT RESULTS OF OPERATIONS
Percentage
(In millions, except percentages) 2023 2022 Change
R evenue
P roductivity and Business Processes $ 69,274 $ 63,364 9%
Intelligent Cloud 87,907 74,965 17%
More ...

[Rank 3] Score: 0.5507
📄 Document: NASDAQ_MSFT_2023.pdf
📝 Text: in accounting estimate, gross margin percentage increased slightly driven by improvement in Office 365
Commercial, offset in part by sales mix shift to cloud offerings.
• Operating expenses increased ...





## 12. Utility Functions

In [20]:
def update_index_with_new_pdfs(new_pdf_paths: List[str], existing_index_path: str = "faiss_index"):
    """Add new PDFs to existing index.

    Loads the index stored at ``existing_index_path``, embeds the new PDFs
    with the local model, appends them and saves the index back in place.
    PDFs are not de-duplicated: passing a file that is already indexed adds
    its chunks a second time.

    Args:
        new_pdf_paths: Paths of the PDF files to add.
        existing_index_path: Folder of the index to extend.

    Raises:
        ValueError: If the new embeddings do not match the index dimension.
    """
    # Load existing index and data
    index_manager = FAISSIndexManager(embedding_dim=384)
    index_manager.load_index(existing_index_path)
    
    # Process new PDFs
    new_documents = [extract_pdf_text(pdf_path) for pdf_path in new_pdf_paths]
    
    # Create chunks for new documents
    new_chunks = create_document_chunks(new_documents)
    if not new_chunks:
        # Nothing to embed; avoid handing an empty array to FAISS and leave
        # the stored index untouched.
        print("⚠️  No text chunks extracted from the new PDFs; index left unchanged")
        return
    
    # Generate embeddings
    embedding_gen = EmbeddingGenerator(method="local")
    chunk_texts = [chunk["text"] for chunk in new_chunks]
    # FAISS requires C-contiguous float32 input
    new_embeddings = np.ascontiguousarray(
        embedding_gen.generate_embeddings(chunk_texts), dtype=np.float32
    )
    
    index_dim = index_manager.index.d
    if new_embeddings.ndim != 2 or new_embeddings.shape[1] != index_dim:
        raise ValueError(
            f"New embeddings have shape {new_embeddings.shape} but the index expects "
            f"vectors of dimension {index_dim}"
        )
    
    # Normalize and add to index
    faiss.normalize_L2(new_embeddings)
    index_manager.index.add(new_embeddings)
    
    # Update chunks and metadata
    index_manager.chunks.extend(new_chunks)
    index_manager.metadata["total_chunks"] = len(index_manager.chunks)
    # Record each new document once. Extending with one filename per chunk
    # would inflate the "Documents indexed" count shown by get_index_stats().
    documents = index_manager.metadata.setdefault("documents", [])
    for chunk in new_chunks:
        if chunk["filename"] not in documents:
            documents.append(chunk["filename"])
    index_manager.metadata["last_updated"] = datetime.now().isoformat()
    
    # Save updated index
    index_manager.save_index(existing_index_path)
    
    print(f"✅ Added {len(new_chunks)} chunks from {len(new_documents)} new documents")

def get_index_stats(index_path: str = "faiss_index"):
    """Print a short summary of a saved index and return its metadata dict.

    Only ``metadata.json`` inside ``index_path`` is read; the FAISS index
    itself is not loaded.
    """
    stats_file = os.path.join(index_path, "metadata.json")
    with open(stats_file, "r") as handle:
        metadata = json.loads(handle.read())

    print("📊 FAISS Index Statistics:")
    print(f"   - Total chunks: {metadata['total_chunks']}")
    print(f"   - Embedding dimension: {metadata['embedding_dim']}")
    print(f"   - Documents indexed: {len(metadata['documents'])}")
    print(f"   - Creation date: {metadata['creation_date']}")
    document_names = ', '.join(metadata['documents'])
    print(f"   - Documents: {document_names}")

    return metadata

## 13. Get Index Statistics

In [21]:
# Get statistics about your index
stats = get_index_stats()

📊 FAISS Index Statistics:
   - Total chunks: 682
   - Embedding dimension: 384
   - Documents indexed: 2
   - Creation date: 2025-07-11T12:20:20.038881
   - Documents: NASDAQ_AAPL_2023.pdf, NASDAQ_MSFT_2023.pdf


## 14. Alternative: Minimal Processing (If All Else Fails)

If you're still experiencing crashes, here's a minimal approach that processes one file at a time:

In [22]:
# Minimal pipeline for problematic systems
def minimal_pipeline():
    """Ultra-safe minimal pipeline.

    Processes every PDF in the ``annual_reports`` folder one at a time:
    PyPDF2 extraction, fixed 500-character chunks (50 characters of overlap),
    embedding two chunks per call, then a flat L2 FAISS index saved to
    ``faiss_index_minimal``. Unlike the main pipeline, vectors are not
    normalised and no metadata.json is written, so the result cannot be
    opened with FAISSIndexManager.load_index().

    Returns:
        Tuple ``(index, all_chunks)``.

    Raises:
        ValueError: If no PDF yielded any text to embed.
    """
    import gc
    from sentence_transformers import SentenceTransformer
    
    # Use the lightest model
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    
    # Process each PDF separately
    pdf_files = list(Path("annual_reports").glob("*.pdf"))
    all_embeddings = []
    all_chunks = []
    
    for pdf_file in pdf_files:
        print(f"\nProcessing {pdf_file.name}...")
        
        # Extract text
        doc = extract_pdf_text(str(pdf_file), method="pypdf2")
        
        # Simple chunking: windows of chunk_size characters, stepping by
        # chunk_size-50 so consecutive chunks overlap by 50 characters
        text = doc['text']
        chunk_size = 500
        chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size-50)]
        
        # Process in tiny batches
        for i in range(0, len(chunks), 2):
            batch = chunks[i:i+2]
            embeddings = model.encode(batch, convert_to_numpy=True)
            all_embeddings.extend(embeddings)
            
            # Create chunk metadata
            for j, chunk_text in enumerate(batch):
                all_chunks.append({
                    "text": chunk_text,
                    "filename": pdf_file.name,
                    "chunk_index": i + j
                })
        
        # Aggressive cleanup
        gc.collect()
        print(f"  Processed {len(chunks)} chunks")
    
    if not all_embeddings:
        # No PDFs, or none with extractable text: np.array([]) is 1-D and
        # shape[1] below would raise an unhelpful IndexError.
        raise ValueError("No text could be extracted from the PDFs in annual_reports; nothing to index")
    
    # Create simple FAISS index
    embeddings_array = np.array(all_embeddings)
    index = faiss.IndexFlatL2(embeddings_array.shape[1])
    index.add(embeddings_array)
    
    # Save
    os.makedirs("faiss_index_minimal", exist_ok=True)
    faiss.write_index(index, "faiss_index_minimal/index.faiss")
    
    with open("faiss_index_minimal/chunks.pkl", "wb") as f:
        pickle.dump(all_chunks, f)
    
    print(f"\n✅ Minimal pipeline complete. Processed {len(all_chunks)} chunks.")
    return index, all_chunks

# Run minimal pipeline
# index, chunks = minimal_pipeline()