# PDF Content Processor for Azure AI Search

This notebook provides tools for processing PDF documents and preparing them for Azure AI Search. It includes functionality for:
- PDF text extraction using PyMuPDF
- Content chunking and processing
- Embedding generation using Azure OpenAI
- Azure AI Search index creation and population

## Setup and Dependencies
First, let's import all required libraries and set up our environment.

In [1]:
# Core libraries
import os
import re
import json
import yaml
import dotenv
import tqdm
import fitz  # PyMuPDF

# Azure and OpenAI related imports
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

# LangChain imports
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, AzureOpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores.azuresearch import AzureSearch

# Load environment variables
dotenv.load_dotenv()

True

## Configuration
Set up the paths and configuration parameters for document processing.

In [2]:
# Directory configuration.
# All paths are relative to the notebook's working directory.
BASE_DIR = "../doc/Temario"
PDF_DIR = os.path.join(BASE_DIR, "temas_por_secciones")
OUTPUT_DIR = os.path.join(BASE_DIR, "../data/output")
METADATA_FILE = os.path.join(PDF_DIR, "document_label.yaml")

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load metadata from YAML file.
# The encoding is given explicitly so that non-ASCII text decodes the same way
# on every platform instead of depending on the locale default.
with open(METADATA_FILE, 'r', encoding="utf-8") as file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so later `metadata.get(...)` lookups keep working.
    metadata = yaml.safe_load(file) or {}

## PDF Processing Functions
Core functions for extracting and processing content from PDF files.

In [3]:
def _parse_chapter_info(first_page_text: str) -> tuple:
    """Return (chapter_number, chapter_title) parsed from a first page's text.

    chapter_number is the first standalone three-digit number found, as an
    int; chapter_title is the text between the letters of "CHAPTER" printed
    one per line and the word "CONTENTS". Either value is None when its
    pattern does not match.
    """
    match_page = re.search(r"\b(\d{3})\b", first_page_text)
    match_title = re.search(
        r"C\nH\nA\nP\nT\nE\nR\n(.*?)\nCONTENTS", first_page_text, re.DOTALL
    )

    chapter_number = int(match_page.group(1)) if match_page else None
    chapter_title = match_title.group(1).strip() if match_title else None
    return chapter_number, chapter_title


def extract_text_from_pdfs(pdf_dir: str, metadata: dict) -> list:
    """
    Extract text content from PDF files in the specified directory.

    Files are visited in sorted filename order and only names ending in
    ".pdf" are processed. The chapter number and title are parsed from the
    first page of each file and applied to every page of that file, so the
    reported page number is ``chapter_number + page index`` throughout the
    document. When no chapter number is found, pages are numbered from 1.

    Args:
        pdf_dir (str): Directory containing PDF files
        metadata (dict): Dictionary containing metadata for each PDF, keyed
            by the part of the filename before the first dot. Each entry may
            provide 'skills', 'subject', 'difficulty' and 'description'.
            ``None`` (or a ``None`` entry) is treated as empty.

    Returns:
        list: List of dictionaries containing extracted text and metadata for
            each page, with keys 'filename', 'page_number', 'text', 'skills',
            'subject', 'difficulty', 'description' and 'chapter_title'.
    """
    metadata = metadata or {}
    documents = []

    # os.listdir order is arbitrary; sorting keeps the output reproducible.
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.endswith(".pdf"):
            continue

        # Get metadata for the current file
        metadata_key = filename.split('.')[0]
        metadata_info = metadata.get(metadata_key) or {}
        skills = metadata_info.get('skills', [])
        if not isinstance(skills, list):
            skills = [skills]

        subject = metadata_info.get('subject', 'Unknown Subject')
        difficulty = metadata_info.get('difficulty', 'Unknown Difficulty')
        description = metadata_info.get('description', 'No Description')

        path = os.path.join(pdf_dir, filename)
        # The context manager closes the document even if a page fails.
        with fitz.open(path) as doc:
            # Chapter info is per file: parsed once on the first page and
            # kept for the remaining pages.
            chapter_number = None
            chapter_title = None

            for page_num, page in enumerate(doc):
                text = page.get_text()

                if page_num == 0:
                    chapter_number, chapter_title = _parse_chapter_info(text)

                # Compare against None so a parsed value of 0 is still used.
                if chapter_number is not None:
                    page_number = chapter_number + page_num
                else:
                    page_number = page_num + 1

                # Create document entry
                documents.append({
                    'filename': filename,
                    'page_number': page_number,
                    'text': text,
                    'skills': skills,
                    'subject': subject,
                    'difficulty': difficulty,
                    'description': description,
                    'chapter_title': chapter_title
                })

    return documents

## Document Processing and Chunking
Process the extracted documents and split them into manageable chunks.

In [4]:
def process_documents(pages: list) -> tuple:
    """
    Wrap page dictionaries in LangChain documents and chunk them.

    Each page becomes one Document whose content is the page text and whose
    metadata carries the page number and filename (both as strings) plus the
    skills, subject, difficulty, description and chapter title unchanged.

    Args:
        pages (list): Page dictionaries with the keys 'text', 'page_number',
            'filename', 'skills', 'subject', 'difficulty', 'description'
            and 'chapter_title'

    Returns:
        tuple: (LangChain documents, chunks)
    """
    # Metadata values that are copied over without conversion.
    passthrough_keys = (
        "skills",
        "subject",
        "difficulty",
        "description",
        "chapter_title",
    )

    # NOTE(review): page_number is stringified here, while the index schema in
    # this file declares "page_number" as Int32 - confirm this is intended.
    docs = [
        Document(
            page_content=entry["text"],
            metadata={
                "page_number": str(entry["page_number"]),
                "filename": str(entry["filename"]),
                **{key: entry[key] for key in passthrough_keys},
            },
        )
        for entry in pages
    ]

    # Chunks of up to 1300 characters with a 100-character overlap; the
    # splitter is asked to add each chunk's start index to its metadata.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1300,
        chunk_overlap=100,
        add_start_index=True,
    )

    return docs, splitter.split_documents(docs)

## Azure AI Search Setup
Configure and create the Azure AI Search index.

In [7]:
def create_search_index_schema(embedding_dimension: int) -> list:
    """
    Create the schema for Azure AI Search index.

    The schema holds a string key, the chunk text and its embedding vector,
    a serialized metadata field, and one field for each of page number,
    skills, subject, difficulty, description, filename and chunk start index.

    Args:
        embedding_dimension (int): Dimension of the embedding vectors

    Returns:
        list: List of field definitions for the search index
    """
    return [
        SimpleField(
            name="id",
            type=SearchFieldDataType.String,
            key=True,
            filterable=True,
        ),
        SearchableField(
            name="content",
            type=SearchFieldDataType.String,
            searchable=True
        ),
        SearchableField(
            name="metadata",
            type=SearchFieldDataType.String,
            searchable=True
        ),
        SearchField(
            name="content_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=embedding_dimension,
            # NOTE(review): must match a vector search profile defined on the
            # index; presumably the one the vector store creates - confirm.
            vector_search_profile_name="myHnswProfile"
        ),
        SimpleField(
            name="page_number",
            type=SearchFieldDataType.Int32,
            filterable=True,
            facetable=True,
            sortable=True
        ),
        SearchableField(
            name="skills",
            type=SearchFieldDataType.String,
            searchable=True,
            collection=True
        ),
        SearchableField(
            name="subject",
            type=SearchFieldDataType.String,
            searchable=True
        ),
        SearchableField(
            name="difficulty",
            type=SearchFieldDataType.String,
            searchable=True
        ),
        SearchableField(
            name="description",
            type=SearchFieldDataType.String,
            searchable=True
        ),
        SearchableField(
            name="filename",
            type=SearchFieldDataType.String,
            searchable=True
        ),
        # `searchable=True` was dropped here: SimpleField always builds a
        # non-searchable field, so the argument had no effect.
        SimpleField(
            name="start_index",
            type=SearchFieldDataType.Int32,
        ),
    ]

## Main Processing Pipeline
Execute the complete document processing pipeline.

In [None]:
def main():
    """Run the full PDF processing pipeline and return the vector store.

    Steps: build the Azure OpenAI embeddings client, extract page text from
    the PDFs in PDF_DIR, split the pages into chunks, create the Azure AI
    Search vector store, and upload the chunks in batches.

    Connection settings are read from the environment variables
    AZURE_OPENAI_KEY, AZURE_OPENAI_ENDPOINT,
    AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME, AZURE_SEARCH_ENDPOINT and
    AZURE_SEARCH_KEY.

    Returns:
        AzureSearch: The vector store the chunks were added to, so that it
            can be queried after the pipeline has run.
    """
    # Initialize Azure OpenAI embeddings
    embeddings = AzureOpenAIEmbeddings(
        api_key=os.getenv("AZURE_OPENAI_KEY"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        azure_deployment=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
        openai_api_version="2024-02-01"
    )

    # Extract text from PDFs
    print("Extracting text from PDFs...")
    pages = extract_text_from_pdfs(PDF_DIR, metadata)

    # Process documents and create chunks (only the chunks are indexed)
    print("Processing documents and creating chunks...")
    _docs, chunks = process_documents(pages)

    # Create and populate Azure AI Search index
    print("Creating and populating Azure AI Search index...")
    index_name = "temario-index-v1"
    vector_store = AzureSearch(
        embedding_function=embeddings.embed_query,
        azure_search_endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"),
        azure_search_key=os.getenv("AZURE_SEARCH_KEY"),
        index_name=index_name,
        additional_search_client_options={"retry_total": 3},
        # Embed a probe string to discover the vector dimension.
        fields=create_search_index_schema(len(embeddings.embed_query("test")))
    )

    # Add chunks to vector store in batches
    batch_size = 500
    for i in tqdm.tqdm(range(0, len(chunks), batch_size)):
        batch = chunks[i:i + batch_size]
        vector_store.add_texts(
            texts=[chunk.page_content for chunk in batch],
            metadatas=[chunk.metadata for chunk in batch],
        )

    print("Processing complete!")
    return vector_store


# Execute the main pipeline.
# The result is bound at module level because the search test further down
# reads a module-level `vector_store`.
if __name__ == "__main__":
    vector_store = main()

Extracting text from PDFs...
Processing documents and creating chunks...
Creating and populating Azure AI Search index...


 50%|█████     | 1/2 [01:08<01:08, 68.17s/it]

## Testing the Search Index
Test the created search index with a sample query.

In [None]:
def test_search(query: str, vector_store: AzureSearch):
    """
    Run a similarity search and print a short summary of every hit.

    For each result this prints its 1-based rank, the first 200 characters
    of its content, its page number and its chapter title ('N/A' when the
    metadata has no chapter title), followed by a separator line.

    Args:
        query (str): Search query
        vector_store (AzureSearch): Initialized Azure Search instance
    """
    separator = "-" * 80
    hits = vector_store.similarity_search(query)

    print(f"Search results for query: '{query}'\n")
    for rank, hit in enumerate(hits, start=1):
        print(f"Result {rank}:")
        print(f"Content: {hit.page_content[:200]}...")
        print(f"Page Number: {hit.metadata['page_number']}")
        print(f"Chapter Title: {hit.metadata.get('chapter_title', 'N/A')}")
        print(separator)

# Example search: runs one sample query and prints the hits.
# NOTE(review): this reads a module-level `vector_store`; confirm one is
# defined before this cell runs, otherwise the call raises NameError.
test_search("What is the main topic of the document?", vector_store)