# Populate Pinecone Database with PDF Content

This notebook extracts paragraphs from PDF files, splits them into overlapping word chunks, processes the chunks, converts them to embeddings, and uploads them to a Pinecone index.


In [1]:
# Import necessary libraries
import sys
import os
from pathlib import Path
from typing import List
import numpy as np
from unstructured.partition.pdf import partition_pdf
import pinecone
from pinecone import Pinecone, ServerlessSpec
import uuid

# Add the project root (the current working directory) to sys.path so that `src.backend` is importable
project_root = Path().resolve()
sys.path.append(str(project_root))

from src.backend.query_processing import QueryProcessor
from src.backend.context_retriever import ContextRetriever


  from .autonotebook import tqdm as notebook_tqdm


## Configuration

Set up Pinecone connection and configuration parameters.


In [2]:
# Pinecone configuration
# The API key is read from the PINECONE_API_KEY environment variable only, so the
# secret is never stored in the notebook itself.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "pdf-knowledge-base")
# Cloud region handed to ServerlessSpec when a new index has to be created
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT", "us-east-1")

# Vector size used when creating the index (BAAI/bge-m3 model dimension)
EMBEDDING_DIMENSION = 1024

# Fail fast with an actionable message instead of sending a placeholder key to
# Pinecone and getting an opaque authentication error on the first API call.
if not PINECONE_API_KEY or PINECONE_API_KEY == "your-pinecone-api-key-here":
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Export it in the environment before running this notebook."
    )

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet
if PINECONE_INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region=PINECONE_ENVIRONMENT
        )
    )
    print(f"Created new index: {PINECONE_INDEX_NAME}")
else:
    print(f"Index {PINECONE_INDEX_NAME} already exists")

# Connect to the index; `index` is used by the upload function defined later
index = pc.Index(PINECONE_INDEX_NAME)
print(f"Connected to index: {PINECONE_INDEX_NAME}")


Index test already exists
Connected to index: test


## PDF Processing Functions


In [3]:
def extract_paragraphs_from_pdf(pdf_path: str, min_words: int = 10) -> List[str]:
    """
    Extract paragraph-sized text blocks from a PDF file using unstructured.

    Every element returned by ``partition_pdf`` that carries text is considered,
    whatever its category. An element is kept when its stripped text is
    non-empty and contains at least ``min_words`` whitespace-separated words,
    which drops headers, captions and other short noise.

    Args:
        pdf_path: Path to the PDF file
        min_words: Minimum number of words an element must contain to be kept
            (default: 10)

    Returns:
        List of stripped text strings, in the order the elements were returned
    """
    # Partition the PDF; table structure and image extraction are disabled
    # because only the text of each element is used below.
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=False,
        extract_images_in_pdf=False,
    )

    paragraphs = []
    for element in elements:
        # Elements without a `text` attribute, or with empty/None text, become ""
        text = (getattr(element, "text", None) or "").strip()
        if text and len(text.split()) >= min_words:
            paragraphs.append(text)

    return paragraphs


In [4]:
def chunk_text_with_overlap(text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
    """
    Split text into chunks of a given word count, with overlap between chunks.

    Args:
        text: Input text to chunk
        chunk_size: Number of words per chunk (default: 300); must be positive
        overlap: Number of words shared by consecutive chunks (default: 50);
            must satisfy 0 <= overlap < chunk_size

    Returns:
        List of text chunks. Text with no words yields an empty list. Text that
        fits in a single chunk is returned unchanged as the only item. Longer
        text is returned as chunks whose words are joined by single spaces.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size would make the window step zero or negative, so the
    # chunking loop could never reach the end of the text.
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()

    # Nothing to chunk: avoid producing an empty-string chunk
    if not words:
        return []

    if len(words) <= chunk_size:
        # Text fits in one chunk; return it as-is
        return [text]

    step = chunk_size - overlap
    chunks = []
    for start_idx in range(0, len(words), step):
        end_idx = start_idx + chunk_size
        chunks.append(' '.join(words[start_idx:end_idx]))

        # Stop once a chunk reaches the last word, so no redundant tail chunk
        # made only of already-covered overlap words is emitted.
        if end_idx >= len(words):
            break

    return chunks


def chunk_paragraphs(paragraphs: List[str], chunk_size: int = 300, overlap: int = 50) -> List[str]:
    """
    Merge paragraphs into one text and cut it into overlapping word chunks.

    The paragraphs are joined with single spaces, so chunk boundaries are
    independent of the original paragraph boundaries.

    Args:
        paragraphs: List of paragraph strings
        chunk_size: Number of words per chunk (default: 300)
        overlap: Number of overlapping words between consecutive chunks (default: 50)

    Returns:
        List of chunked text segments produced by chunk_text_with_overlap
    """
    return chunk_text_with_overlap(' '.join(paragraphs), chunk_size, overlap)


In [5]:
def process_paragraphs(paragraphs: List[str], query_processor: QueryProcessor) -> List[str]:
    """
    Run every paragraph through the query processor.

    Each string is passed to ``query_processor.process`` so that stored text is
    normalized by the same routine that is applied to queries.

    Args:
        paragraphs: List of raw paragraph strings
        query_processor: QueryProcessor instance whose ``process`` method is applied

    Returns:
        List of processed strings, in the same order as the input
    """
    return [query_processor.process(raw_text) for raw_text in paragraphs]


## Main Function to Populate Database


In [6]:
def _build_vectors(pdf_name, start_idx, original_chunks, processed_chunks, embeddings, chunk_size, overlap):
    """Build the Pinecone upsert payload (id, values, metadata) for one batch of chunks."""
    vectors = []
    for offset in range(len(processed_chunks)):
        chunk_index = start_idx + offset

        # Deterministic ID: the same PDF name, chunking parameters and chunk
        # position always map to the same ID, so re-running the notebook
        # overwrites the existing records instead of adding duplicates.
        vector_id = str(uuid.uuid5(
            uuid.NAMESPACE_URL,
            f"{pdf_name}:{chunk_size}:{overlap}:{chunk_index}"
        ))

        vectors.append({
            "id": vector_id,
            "values": embeddings[offset].tolist(),  # Convert array row to a plain list
            "metadata": {
                "pdf_name": pdf_name,
                "chunk_index": chunk_index,
                "text": original_chunks[offset],  # Store original text for retrieval
                "processed_text": processed_chunks[offset]  # Store processed text for reference
            }
        })
    return vectors


def populate_pinecone_from_pdf(pdf_path: str, chunk_size: int = 300, overlap: int = 50, batch_size: int = 100):
    """
    Extract paragraphs from a PDF, chunk them, process them, embed them, and
    upsert the resulting vectors into the module-level Pinecone ``index``.

    Vector IDs are derived deterministically from the PDF file stem, the
    chunking parameters and the chunk position. Running the function again for
    the same PDF with the same parameters therefore replaces the stored vectors
    rather than duplicating them.

    Args:
        pdf_path: Path to the PDF file; a relative path is resolved against the
            current working directory
        chunk_size: Number of words per chunk (default: 300)
        overlap: Number of overlapping words between consecutive chunks (default: 50)
        batch_size: Number of chunks to embed and upload per batch (default: 100)

    Raises:
        ValueError: If batch_size is not positive, or the number of embeddings
            returned for a batch differs from the number of chunks in it
        FileNotFoundError: If the PDF file does not exist
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # resolve() makes a relative path absolute against the current working directory
    pdf_path = Path(pdf_path).resolve()
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Get PDF name for metadata
    pdf_name = pdf_path.stem

    print(f"Processing PDF: {pdf_name}")
    print(f"PDF path: {pdf_path}")
    print("-" * 50)

    # Step 1: Extract paragraphs from PDF
    print("Step 1: Extracting paragraphs from PDF...")
    paragraphs = extract_paragraphs_from_pdf(str(pdf_path))
    print(f"Extracted {len(paragraphs)} paragraphs")

    if len(paragraphs) == 0:
        print("Warning: No paragraphs extracted from PDF. Exiting.")
        return

    # Step 2: Chunk paragraphs into segments of specified size with overlap
    print(f"\nStep 2: Chunking paragraphs into {chunk_size}-word segments with {overlap}-word overlap...")
    chunked_segments = chunk_paragraphs(paragraphs, chunk_size=chunk_size, overlap=overlap)
    print(f"Created {len(chunked_segments)} chunks")

    # Step 3: Process chunks using QueryProcessor
    print("\nStep 3: Processing chunks...")
    query_processor = QueryProcessor()
    processed_chunks = process_paragraphs(chunked_segments, query_processor)
    print(f"Processed {len(processed_chunks)} chunks")

    # Step 4: Convert to embeddings using ContextRetriever
    print("\nStep 4: Converting chunks to embeddings...")
    context_retriever = ContextRetriever()

    # Process in batches to avoid memory issues
    total_uploaded = 0
    num_batches = (len(processed_chunks) + batch_size - 1) // batch_size

    for batch_idx, start_idx in enumerate(range(0, len(processed_chunks), batch_size)):
        end_idx = start_idx + batch_size
        batch_chunks = processed_chunks[start_idx:end_idx]
        batch_original = chunked_segments[start_idx:end_idx]  # Keep original for storage

        print(f"Processing batch {batch_idx + 1}/{num_batches} ({len(batch_chunks)} chunks)...")

        # Convert batch to embeddings
        embeddings = context_retriever.convert_batch_to_embeddings(batch_chunks)

        # Accept plain sequences as well as array-like results, then normalize
        # a single 1D embedding to one row so it can be indexed per chunk.
        if not hasattr(embeddings, "ndim"):
            embeddings = np.asarray(embeddings)
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)

        # A count mismatch would pair chunks with the wrong vectors
        if len(embeddings) != len(batch_chunks):
            raise ValueError(
                f"Expected {len(batch_chunks)} embeddings for batch {batch_idx + 1}, got {len(embeddings)}"
            )

        # Step 5: Prepare vectors for Pinecone upload
        vectors_to_upload = _build_vectors(
            pdf_name, start_idx, batch_original, batch_chunks, embeddings, chunk_size, overlap
        )

        # Upload batch to Pinecone
        print(f"Uploading batch {batch_idx + 1} to Pinecone...")
        index.upsert(vectors=vectors_to_upload)
        total_uploaded += len(vectors_to_upload)
        print(f"Uploaded {len(vectors_to_upload)} vectors (Total: {total_uploaded})")

    print("\n" + "=" * 50)
    print(f"Successfully uploaded {total_uploaded} vectors to Pinecone!")
    print(f"PDF: {pdf_name}")
    print(f"Chunk size: {chunk_size} words, Overlap: {overlap} words")
    print("=" * 50)


## Usage Example

Run the function with the path to your PDF file.


In [7]:
# Example usage: point this at the PDF you want to ingest.
# A relative path is resolved against the directory the notebook runs in.
pdf_file_path = "test_data/takamuku2009.pdf"

# Ingest the PDF as 300-word chunks that overlap by 50 words,
# embedding and uploading up to 100 chunks per batch.
populate_pinecone_from_pdf(
    pdf_path=pdf_file_path,
    chunk_size=300,
    overlap=50,
    batch_size=100,
)


Processing PDF: takamuku2009
PDF path: /Users/user/Desktop/pdf-knowledge-assistant/test_data/takamuku2009.pdf
--------------------------------------------------
Step 1: Extracting paragraphs from PDF...
Extracted 136 paragraphs

Step 2: Chunking paragraphs into 300-word segments with 50-word overlap...
Created 26 chunks

Step 3: Processing chunks...
Processed 26 chunks

Step 4: Converting chunks to embeddings...
Processing batch 1/1 (26 chunks)...


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Uploading batch 1 to Pinecone...
Uploaded 26 vectors (Total: 26)

Successfully uploaded 26 vectors to Pinecone!
PDF: takamuku2009
Chunk size: 300 words, Overlap: 50 words
