# PDFs analyzer
Tool for extracting content from PDFs and format it to be used in Azure AI Search

## Using Azure Document Intelligence


In [1]:
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import re
import os
import dotenv
import tqdm

In [8]:
# Get all PDFs in data directory
data_dir = "../doc/Temario/"
document_paths = []
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.endswith(".pdf"):
            document_paths.append(os.path.join(root, file))
print(f"Found {len(document_paths)} documents in {data_dir}")

Found 32 documents in ../doc/Temario/


In [3]:
def extract_info_from_result(result):
    # Iterate through the paragraph elements in the result and get the first title role
    for paragraph in result.paragraphs:
        # Check if the paragraph has a title role
        if paragraph.role == "title":
            title = paragraph.content
            print(f"Title: {title}")
            break  # Exit after the first title is found

    # Iterate through the tables in the result
    final_content = []
    for page_idx, page in enumerate(result.pages):
        text = ""
        for line in page.lines:
            text += line.content + "\n"

        # Get the page number
        page_number = page.page_number
        final_content.append({
            "page_number": page_number,
            "type": "text",
            "content": text,
            "chapter_title": title
        })

    # Iterate through the tables in the result
    for table in result.tables:
        table_content = []
        for cell in table.cells:
            cell_content = {
                "row_index": cell.row_index,
                "column_index": cell.column_index,
                "content": cell.content
            }
            table_content.append(cell_content)

        # Get the page number
        final_content.append({
            "type": "table",
            "content": table_content,
            "chapter_title": title
        })

    return final_content


In [4]:
# Load environment variables from .env file
dotenv.load_dotenv()

# Get endpoint and key from environment variables
endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# Cliente
client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))

complete_proccesed_files = []
for document_path in tqdm.tqdm(document_paths):
    # Extraer el nombre del archivo sin la extensión
    filename = os.path.splitext(os.path.basename(document_path))[0]
    print(f"Processing {filename}")

    # Crear un directorio para guardar los resultados
    output_dir = os.path.join(data_dir, "../data/output")
    os.makedirs(output_dir, exist_ok=True)

    # Leer el archivo PDF
    with open(document_path, "rb") as f:
        poller = client.begin_analyze_document("prebuilt-layout", document=f)
        result = poller.result()

        # Extraer el texto del resultado
        extracted_text = extract_info_from_result(result) 

        # Añadir a complete_proccesed_files
        complete_proccesed_files.append(extracted_text)

        # Guardar el resultado en un archivo JSON
        output_file = os.path.join(output_dir, f"{filename}_output.json")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(str(extracted_text))


  0%|          | 0/32 [00:00<?, ?it/s]

Processing 0494-0521


ERROR:tornado.general:SEND Error: Host unreachable
  0%|          | 0/32 [00:08<?, ?it/s]


KeyboardInterrupt: 

In [60]:
complete_proccesed_files_flatten = []
for file in complete_proccesed_files:
    for item in file:
        complete_proccesed_files_flatten.append(item)

In [63]:
# Guardar el resultado en un archivo JSON
output_dir = os.path.join(data_dir, "../data/output")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "complete_proccesed_files.json")
with open(output_file, "w", encoding="utf-8") as f:
    f.write(str(complete_proccesed_files_flatten))

## Local processing using PyMuPDF

In [26]:
import fitz  # PyMuPDF
import os
import re
def extract_text_from_pdfs(pdf_dir, metadata):
    documents = []
    for filename in os.listdir(pdf_dir):
        if filename.endswith(".pdf"):
            path = os.path.join(pdf_dir, filename)
            doc = fitz.open(path)
            
            # Get the metadata using the filename as the key
            metadata_key = filename.split('.')[0]
            metadata_info = metadata.get(metadata_key, {})
            skills = metadata_info.get('skills', [])
            # if skills is not a list, convert it to an empty list
            if not isinstance(skills, list):
                skills = [skills]
            subject = metadata_info.get('subject', 'Unknown Subject')
            difficulty = metadata_info.get('difficulty', 'Unknown Difficulty')
            description = metadata_info.get('description', 'No Description')

            # Iterate through each page in the PDF
            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()
                
                if page_num == 0:
                    # Buscar los tres primeros dígitos del número de página (suponemos que es la primera línea con 3 dígitos)
                    match_page = re.search(r"\b(\d{3})\b", text)

                    # Buscar título del capítulo: palabra después de secuencia C\nH\nA... (CHAPTER) y salto de línea
                    match_title = re.search(r"C\nH\nA\nP\nT\nE\nR\n(.*?)\nCONTENTS", text, re.DOTALL)

                    chapter_number = int(match_page.group(1)) if match_page else None
                    chapter_title = match_title.group(1).strip() if match_title else None

                # Append the text to the documents list
                documents.append({
                    'filename': filename,
                    'page_number': chapter_number + page_num if chapter_number else page_num + 1,
                    'chapter_title': chapter_title,
                    'text': text,
                    'skills': skills,
                    'subject': subject,
                    'difficulty': difficulty,
                    'description': description
                })
            
    return documents


In [2]:
content_parsed_bronze = extract_text_from_pdfs("../doc/Temario")

MuPDF error: library error: FT_New_Memory_Face(HDKKPH+Gian5e): unknown file format



# Native Language Processing with Azure Document Intelligence

In [2]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader

from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain_community.vectorstores.azuresearch import AzureSearch

import os
import dotenv
import tqdm
dotenv.load_dotenv()

True

In [None]:
def load_using_azure_document_intelligence(pdf_files, base_path):
    docs = []
    analysis_features = ["ocrHighResolution"]
    for pdf_file in pdf_files:
        loader = AzureAIDocumentIntelligenceLoader(
            file_path=os.path.join(base_path, pdf_file), 
            api_endpoint=os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"), 
            api_key=os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY"),
            analysis_features=analysis_features,
            api_model="prebuilt-layout")
        docs.extend(loader.load())
    # load the pdfs
    return docs

def load_using_pyMuPDF(base_path, metadata):
    pages = extract_text_from_pdfs(base_path, metadata)
    # Save the pages in a JSON file
    output_dir = os.path.join(base_path, "../data/output")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "content_parsed_bronze.json")
    with open(output_file, "w", encoding="utf-8") as f:
        import json
        json.dump(pages, f, ensure_ascii=False, indent=4)

    # parse to documents
    docs = []
    for page in pages:
        docs.append(Document(page_content=page["text"], 
                             metadata={"page_number": str(page["page_number"]), 
                                       "chapter_title": str(page["chapter_title"]), 
                                       "filename": str(page["filename"]),
                                        "skills": page["skills"],
                                        "subject": page["subject"],
                                        "difficulty": page["difficulty"],
                                        "description": page["description"]
                                       }
                                       ))
    return docs, pages
     


In [28]:
# Read metadata yaml file
yaml_file = "../doc/Temario/temas_por_secciones/document_label.yaml"

import yaml
with open(yaml_file, 'r') as file:
    metadata = yaml.safe_load(file)

print(metadata)

{'tema1_1': {'document_name': 'tema1_1.pdf', 'skills': ['medidas_e_incertidumbres'], 'subject': 'Introduccion, medidas y estimaciones', 'difficulty': 'Facil', 'description': 'Capitulo 1 secciones 1 a 1.4'}, 'tema1_2': {'document_name': 'tema1_2.pdf', 'skills': 'unidades_y_sistema_estandar', 'subject': 'Introduccion, medidas y estimaciones', 'difficulty': 'Facil', 'description': 'Capitulo 1 secciones 1.5'}, 'tema1_3': {'document_name': 'tema1_3.pdf', 'skills': ['conversion_de_unidades', 'unidades_y_sistema_estandar'], 'subject': 'Introduccion, medidas y estimaciones', 'difficulty': 'Medio', 'description': 'Capitulo 1 secciones 1.6'}, 'tema1_4': {'document_name': 'tema1_4.pdf', 'skills': ['unidades_y_sistema_estandar', 'ordenes_de_magnitud', 'dimensiones_y_analisis_dimensional'], 'subject': 'Introduccion, medidas y estimaciones', 'difficulty': 'Medio', 'description': 'Capitulo 1 secciones 1.7 a 1.8'}, 'tema1_5': {'document_name': 'tema1_5.pdf', 'skills': ['conversion_de_unidades', 'unida

In [None]:
# get all pdfs in the doc/Temario directory
pdf_files = [f for f in os.listdir("../doc/Temario/temas_por_secciones") if f.endswith(".pdf")]

base_path = "../doc/Temario/temas_por_secciones"

docs, pages = load_using_pyMuPDF(base_path, metadata)


In [30]:
# split the docs into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1300, chunk_overlap=100, add_start_index=True)
chunks = text_splitter.split_documents(docs)


In [None]:
# remove the metadata.content field
#for chunk in chunks:
#    chunk.metadata.pop("content")

# save the chunks to a json file
with open("chunks.json", "w") as f:
    f.write(str(chunks))


AttributeError: 'Document' object has no attribute 'to_dict'

In [20]:
chunks[1]

Document(metadata={'page_number': '187', 'chapter_title': 'None', 'filename': 'tema18_3.pdf', 'skills': ['corriente_electrica', 'potencia_electrica'], 'subject': 'Corrientes electricas', 'difficulty': 'Medio', 'description': 'Capitulo 18 secciones 18.5 a 18.6', 'start_index': 1207}, page_content='T = T0 + R - R0\naR0\n= 20.0°C +\n187.4 \x03 - 164.2 \x03\nA3.927 * 10–3(C°)–1B(164.2 \x03) = 56.0°C.\na\nT0 = 20.0°C.\nR0 = r0 l\x02A\nR = R0 C1 + a AT - T0B D.\n(l\x02A)\nR = rl\x02A,\nr,\n187.4 \x03.\n164.2 \x03.\nEXAMPLE 18;7\n510\nCHAPTER 18\nElectric Currents\nP H Y S I C S  A P P L I E D\nResistance thermometer\nEXERCISE G The resistance of the tungsten filament of a common incandescent light-\nbulb is how many times greater at its operating temperature of 2800 K than its resistance\nat room temperature? (a) Less than 1% greater; (b) roughly 10% greater; (c) about \n2 times greater; (d) roughly 10 times greater; (e) more than 100 times greater.\nThe value of \nin Eq. 18–4 can itself dep

In [13]:
# Flat the metadata of the chunks keeping the page content
import random

parsed_chunks = []
for chunk in chunks:
    parsed_chunks.append({
        "id": random.randint(1, 1000000),
        "page_content": chunk.page_content,
        "page_number": chunk.metadata["page_number"],
        "chapter_title": chunk.metadata["chapter_title"],
        "filename": chunk.metadata["filename"],
        "start_index": chunk.metadata["start_index"]
    })
len(parsed_chunks)

3524

In [40]:
# Create the embeddings using Azure OpenAI
embeddings = AzureOpenAIEmbeddings(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    azure_deployment="text-embedding-002",
    openai_api_version="2024-02-01"
)

In [None]:
def generate_embeddings(text, model="text-embedding-002"):
    """Generate embeddings for a given text using Azure OpenAI."""
    embedding_text = embeddings.embed_query(text)
    return embedding_text

In [None]:
from azure.search.documents.indexes.models import (
    ScoringProfile,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    TextWeights,
)

# Create the index schema
fields = [
        SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(name="content", 
                    type=SearchFieldDataType.String , 
                    searchable=True),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),                
        searchable=True,
                vector_search_dimensions=len(embeddings.embed_query("Text")), 
                vector_search_profile_name="myHnswProfile"
                ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchableField(name="skills", type=SearchFieldDataType.Collection(SearchFieldDataType.String), searchable=True),
    SearchableField(name="subject", type=SearchFieldDataType.String, searchable=True),
    SearchableField(name="difficulty", type=SearchFieldDataType.String, searchable=True),
    SearchableField(name="description", type=SearchFieldDataType.String, searchable=True),
    SearchableField(name="chapter_title", 
                    type=SearchFieldDataType.String, 
                    searchable=True),
    SearchableField(name="filename", 
                    type=SearchFieldDataType.String, 
                    searchable=True),
    SearchableField(name="start_index", 
                    type=SearchFieldDataType.String, 
                    searchable=True),
]


In [33]:
# print the unique types of the metadata
metadata_types = set()
types = [type(chunk.metadata["description"]) for chunk in chunks]
set_types = set(types)
print(set_types)

{<class 'str'>}


In [35]:

# Create the Azure AI Search index
index_name = "temario-index-v1"

vector_store: AzureSearch = AzureSearch(
    embedding_function=embeddings.embed_query,
    azure_search_endpoint="https://test-tutoria.search.windows.net",
    azure_search_key="bZ3ejcekyuQPon45GLBCAan0fXhUv5rWoJFjdmJMgRAzSeDnI3hX",
    index_name=index_name,
    additional_search_client_options={"retry_total": 3},
    fields=fields
)

# Add the chunks to the vector store in batches of 500
batch_size = 50
for i in tqdm.tqdm(range(0, len(chunks), batch_size)):
    batch = chunks[i:i + batch_size]
    vector_store.add_documents(batch)


  0%|          | 0/17 [00:24<?, ?it/s]


HttpResponseError: () The request is invalid. Details: An unexpected 'StartArray' node was found when reading from the JSON reader. A 'PrimitiveValue' node was expected.
Code: 
Message: The request is invalid. Details: An unexpected 'StartArray' node was found when reading from the JSON reader. A 'PrimitiveValue' node was expected.

In [40]:
# perform a similarity search
query = "What is the main topic of the document?"
results = vector_store.similarity_search(query)

# print the results
for result in results:
    print(result.page_content)
    print(result.metadata["page_number"])
    print(result.metadata["chapter_title"])
    print("\n")

You too can experience rapid
rotation-if your stomach can
take the high angular velocity
and centripetal acceleration of
some of the faster amusement
park rides. If not, try the slower
merry-go-round or Ferris
wheel. Rotating carnival rides
have rotational kinetic energy
as well as angular momentum.
Angular acceleration is
produced by a net torque, and
rotating objects have rotational
kinetic energy.


<figure>

CHAPTER
8

</figure>


# CONTENTS

8-1 Angular Quantities

8-2 Constant Angular Acceleration

8-3 Rolling Motion
(Without Slipping)

8-4 Torque

8-5 Rotational Dynamics;
Torque and Rotational Inertia

8-6 Solving Problems in
Rotational Dynamics

8-7 Rotational Kinetic Energy

8-8 Angular Momentum and
Its Conservation

*8-9 Vector Nature of
Angular Quantities


<figure>
</figure>


# Rotational Motion


## CHAPTER-OPENING QUESTION-Guess now!

A solid ball and a solid cylinder roll down a ramp. They both start from rest at the
same time and place. Which gets to the bottom first?


KeyError: 'page_number'