# PDF analyzer
Tool for extracting content from PDFs and formatting it for use in Azure AI Search.

## Using Azure Document Intelligence


In [1]:
import json
import os
import re

import dotenv
import tqdm
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

In [2]:
# Recursively collect the path of every ".pdf" file below the data directory
data_dir = "../doc/Temario/"
document_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(data_dir)
    for name in names
    if name.endswith(".pdf")
]
print(f"Found {len(document_paths)} documents in {data_dir}")

Found 32 documents in ../doc/Temario/


In [3]:
def extract_info_from_result(result):
    """Flatten a layout analysis result into a list of content records.

    Args:
        result: Object exposing ``paragraphs``, ``pages`` and ``tables``
            (in this notebook, the value returned by ``poller.result()``).
            Paragraphs need ``role`` and ``content``; pages need
            ``page_number`` and ``lines`` (each line with ``content``);
            tables need ``cells`` (each with ``row_index``,
            ``column_index`` and ``content``).

    Returns:
        A list of dicts. First one ``"text"`` record per page, holding the
        page's lines joined with newlines; then one ``"table"`` record per
        table, holding its cells as a list of
        ``{"row_index", "column_index", "content"}`` dicts. Every record has
        ``"page_number"``, ``"type"``, ``"content"`` and ``"chapter_title"``.
        ``"chapter_title"`` is ``None`` when no paragraph has the "title"
        role; a table's ``"page_number"`` is ``None`` when the table carries
        no bounding region.
    """
    # The first paragraph with the "title" role is used as the chapter title.
    # Defaulting to None avoids an UnboundLocalError further down when the
    # document has no such paragraph.
    title = None
    for paragraph in result.paragraphs or []:
        if paragraph.role == "title":
            title = paragraph.content
            print(f"Title: {title}")
            break  # Only the first title is used

    final_content = []

    # One "text" record per page: every line followed by a newline
    for page in result.pages or []:
        text = "".join(line.content + "\n" for line in page.lines or [])
        final_content.append({
            "page_number": page.page_number,
            "type": "text",
            "content": text,
            "chapter_title": title
        })

    # One "table" record per table, with its cells flattened to plain dicts
    for table in result.tables or []:
        table_content = [
            {
                "row_index": cell.row_index,
                "column_index": cell.column_index,
                "content": cell.content
            }
            for cell in table.cells
        ]

        # A table's page is taken from its first bounding region, when present
        regions = getattr(table, "bounding_regions", None)
        page_number = regions[0].page_number if regions else None

        final_content.append({
            "page_number": page_number,
            "type": "table",
            "content": table_content,
            "chapter_title": title
        })

    return final_content


In [4]:
# Load environment variables from .env file
dotenv.load_dotenv()

# Get endpoint and key from environment variables
endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
if not endpoint or not key:
    # Fail early with a clear message instead of an opaque error from the SDK
    raise RuntimeError(
        "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_KEY "
        "must be set (for example in the .env file)"
    )

# Client
client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Directory where the results are saved; it does not depend on the document,
# so it is created once before the loop
output_dir = os.path.join(data_dir, "../data/output")
os.makedirs(output_dir, exist_ok=True)

complete_proccesed_files = []
for document_path in tqdm.tqdm(document_paths):
    # File name without its extension
    filename = os.path.splitext(os.path.basename(document_path))[0]
    print(f"Processing {filename}")

    # Send the PDF to the "prebuilt-layout" model and wait for the analysis
    with open(document_path, "rb") as pdf_file:
        poller = client.begin_analyze_document("prebuilt-layout", document=pdf_file)
        result = poller.result()

    # Convert the analysis result into a list of content records
    extracted_text = extract_info_from_result(result)

    # Keep the records for the combined output
    complete_proccesed_files.append(extracted_text)

    # Save the records as real JSON (str() would write a Python repr, which
    # JSON parsers reject); a separate handle name avoids shadowing the PDF file
    output_file = os.path.join(output_dir, f"{filename}_output.json")
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(extracted_text, json_file, ensure_ascii=False, indent=2)


  0%|          | 0/32 [00:00<?, ?it/s]

Processing 0494-0521


ERROR:tornado.general:SEND Error: Host unreachable
  0%|          | 0/32 [00:08<?, ?it/s]


KeyboardInterrupt: 

In [60]:
# Merge the per-document record lists into one flat list of records
complete_proccesed_files_flatten = [
    record
    for document_records in complete_proccesed_files
    for record in document_records
]

In [63]:
# Save the flattened records of all documents to a single JSON file
output_dir = os.path.join(data_dir, "../data/output")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "complete_proccesed_files.json")
with open(output_file, "w", encoding="utf-8") as f:
    # json.dump produces valid JSON; str() would write a Python repr
    # (single quotes, None) that JSON parsers reject
    json.dump(complete_proccesed_files_flatten, f, ensure_ascii=False, indent=2)

## Local processing using PyMuPDF

In [42]:
import fitz  # PyMuPDF
import os
import re
def _parse_first_page(text):
    """Return ``(chapter_number, chapter_title)`` parsed from first-page text; each is None when not found."""
    # First standalone three-digit number in the text, used as the number
    # of the document's first page
    match_page = re.search(r"\b(\d{3})\b", text)

    # Chapter title: the text between the letters of "CHAPTER" laid out one
    # per line and the following "CONTENTS" line
    match_title = re.search(r"C\nH\nA\nP\nT\nE\nR\n(.*?)\nCONTENTS", text, re.DOTALL)

    chapter_number = int(match_page.group(1)) if match_page else None
    chapter_title = match_title.group(1).strip() if match_title else None
    return chapter_number, chapter_title


def extract_text_from_pdfs(pdf_dir):
    """Extract the text of every page of every PDF directly inside ``pdf_dir``.

    Files are processed in sorted name order so the result is reproducible.
    Sub-directories are not visited.

    Args:
        pdf_dir: Directory containing the ``.pdf`` files.

    Returns:
        A list with one dict per page, with the keys ``'filename'``,
        ``'page_number'``, ``'chapter_title'`` and ``'text'``.
        ``'page_number'`` is the three-digit number found on the document's
        first page plus the page's zero-based index, or the one-based index
        when no such number was found. ``'chapter_title'`` is the title
        parsed from the first page, or ``None``.
    """
    documents = []
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.endswith(".pdf"):
            continue

        path = os.path.join(pdf_dir, filename)
        # The context manager closes the document even if a page fails to parse
        with fitz.open(path) as doc:
            chapter_number = None
            chapter_title = None

            for page_num in range(len(doc)):
                text = doc[page_num].get_text()

                # Chapter metadata is only read from the first page
                if page_num == 0:
                    chapter_number, chapter_title = _parse_first_page(text)

                # Compare against None so a parsed number of 0 is not
                # mistaken for "no number found"
                if chapter_number is not None:
                    page_number = chapter_number + page_num
                else:
                    page_number = page_num + 1

                documents.append({
                    'filename': filename,
                    'page_number': page_number,
                    'chapter_title': chapter_title,
                    'text': text
                })

    return documents


In [21]:
# Run the local PyMuPDF extraction over the PDF directory
content_parsed_bronze = extract_text_from_pdfs(pdf_dir="../doc/Temario")

MuPDF error: library error: FT_New_Memory_Face(HDKKPH+Gian5e): unknown file format

