# Pipeline

* Input1: pdf file to process with Mistral
* Input2: Docling Document (doc.save_as_json())
* Output: an enriched .md file


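This notebook shows how to clean a Docling document with MistralAI.  
It shows how to remove page headers and footers, fix level-1 section titles, and enrich images with a description of their content.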

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules (e.g. the local `src` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import base64
import json
from pathlib import Path
from dotenv import load_dotenv
from mistralai import Mistral, DocumentURLChunk
from mistralai.models import OCRResponse
from mistralai.extra import response_format_from_pydantic_model
from openai import OpenAI

from src import mistral_pl, pipeline as pl, prompts
from IPython.display import display, Markdown, Image as dImage

# Params

In [None]:
# Directory layout, relative to the notebook's working directory.
path_data = Path("data")
path_input = path_data.joinpath("raw")  # the input PDF file is read from here
path_pipeline = path_data.joinpath("pipeline")  # images from the docling extraction live here

# Create the pipeline folder (and any missing parents); a no-op if it already exists.
path_pipeline.mkdir(parents=True, exist_ok=True)

In [None]:
# NOTE(review): IMAGE_RESOLUTION_SCALE and NUM_PAGES are not referenced in the
# visible cells — presumably meant for the docling conversion exercises; confirm.
IMAGE_RESOLUTION_SCALE = 2.0 
NUM_PAGES = 3   # <9  (NOTE(review): the reason for this upper bound is not stated here)
LLM = "gpt-4.1-mini-2025-04-14"  # model id passed to the OpenAI Responses call that builds the footer regex


In [None]:
# Names of the PDF to process and of the Markdown file to produce.
input_file = "Divulgacion Planetaria Althera.pdf"
output_file = "Divulgacion-Planetaria-Althera.md"

# Full path of the input PDF inside the raw-data folder.
path_input_doc = path_input / input_file
# JSON containing the analysis of the document by the Mistral OCR document annotation.
# (Fixed: the original referenced the undefined name `path_pipelin`, raising NameError.)
path_doc_analysis = path_pipeline / "doc_analysis.json"


# Environment Variables

In [6]:
# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()

# Direct indexing on purpose: a missing key raises KeyError right here.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
MISTRAL_API_KEY = os.environ["MISTRAL_API_KEY"]

In [7]:
# Docling models are stored under ~/.cache/docling/models.
path_artifacts = Path.home().joinpath(".cache", "docling", "models")

# Last expression of the cell: displays whether that folder is present.
path_artifacts.exists()

True

# Clients

In [8]:
# One API client per provider, each authenticated with the key read from the environment.
client_mistral = Mistral(api_key=MISTRAL_API_KEY)
client_openai = OpenAI(api_key=OPENAI_API_KEY)


## PDF Conversion

## Analyze

In [None]:
# Hand the input PDF to the project helper and keep the returned file id.
# NOTE(review): judging by its name the helper skips the upload when the file
# is already stored remotely — helper source not shown here; confirm.
id_file = mistral_pl.upload_if_not_exists(client_mistral, path_input_doc)

In [10]:
# Request a signed URL for the uploaded file.
# NOTE(review): `expiry=1` is presumably expressed in hours — confirm against the Mistral SDK.
signed_url = client_mistral.files.get_signed_url(file_id=id_file, expiry=1)

In [30]:
# Run the project's document analysis on the signed URL. The recorded output
# shows a `Document` with the detected language, index chapter titles, title,
# page headers and page footers.
# NOTE(review): `path_doc_analysis` is presumably where the helper persists or
# caches the JSON result — confirm in `mistral_pl.analyze_document`.
doc_analysis = mistral_pl.analyze_document(client_mistral, signed_url.url, path_doc_analysis)
doc_analysis

Document(language='es', index_detected=True, chapter_titles_from_index=['Historia del descubrimiento', 'Conoce a Althéra', 'Los soles de Althéra', 'Estructura general de Althéra', 'Planetas interiores', 'Planetas exteriores', 'Lunas y satélites menores', 'Fenómenos destacados', 'Habitabilidad y astrobiología', 'Conclusiones y perspectivas futuras'], title_detected=True, title='Un nuevo y fascinante vecino: Althéra', page_header_detected=True, page_header=['Divulgación Planetaria: 2025-07'], page_footer_detected=True, page_footer=['PÃ¡gina 2 | 15', 'PÃ¡gina 3 | 15'])

## Remove Headers Programmatically with exact match

In [None]:
# First page header found by the analysis; indexing fails with IndexError
# when `page_header` is empty (i.e. no header was detected).
example = doc_analysis.page_header[0]
example

In [41]:
# Two sample page texts: the first begins with the page header, the second has none.
texts = [
    "Divulgación Planetaria: 2025-07\nAlthéra B (HD 4579 B), una enana",
    "Esta combinación de espectros produce un ambiente luminoso único",
]

# Plain substring test: report every sample that contains the header text.
for i, text in enumerate(texts):
    if example not in text:
        continue
    print(f"'{example}' is in '{i=}'")

'Divulgación Planetaria: 2025-07' is in 'i=0'


## Remove Footers Programmatically with an LLM generated regex

In [None]:
# One detected footer per line, handed to the prompt builder as examples.
examples = "\n".join(doc_analysis.page_footer)

# Prompt text asking the LLM for a regex that matches those footers.
PROMPT_PAGE_FOOTER_REGEX = prompts.build_regex_footer(examples)




In [None]:
# Sample texts for checking the generated pattern:
# footer at the end, no footer at all, footer at the start.
text1 = "más masiva y luminosa que el Sol.\n Página 8 | 15"
text2 = "en la pagina 7 no vienenada"
text3 = "Página 8 | 15\nDivulgación Planetaria: 2025-07"

# Ask the model for a footer regex built from the detected footer examples.
footer_regex_messages = [
    {"role": "system", "content": "You are a Python regex expert."},
    {"role": "user", "content": PROMPT_PAGE_FOOTER_REGEX},
]
response = client_openai.responses.create(model=LLM, input=footer_regex_messages)

# The reply text is used verbatim as the pattern.
regex_footer = response.output_text

# Show the pattern, then the validation result for each sample in order.
print(f"{regex_footer}")
print("-" * 30)

match1 = pl.is_valid_input(regex_footer, text1)
print(f"{match1}")

match2 = pl.is_valid_input(regex_footer, text2)
print(f"{match2}")

match3 = pl.is_valid_input(regex_footer, text3)
print(f"{match3}")

P[aá]gina\s+\d+\s*\|\s*\d+
------------------------------
True
False
True


## Detect level 1 section titles and fix them

In [52]:
# Chapter titles taken from the document index by the analysis step; used as
# the reference list when checking level-1 heading candidates.
lst_index = doc_analysis.chapter_titles_from_index
lst_index

['Historia del descubrimiento',
 'Conoce a Althéra',
 'Los soles de Althéra',
 'Estructura general de Althéra',
 'Planetas interiores',
 'Planetas exteriores',
 'Lunas y satélites menores',
 'Fenómenos destacados',
 'Habitabilidad y astrobiología',
 'Conclusiones y perspectivas futuras']

In [57]:
# Heading candidates in several shapes: plain text, numbered, level 1,
# level 2, and a numbered sub-section that is not part of the index.
texts = [
    "Historia del descubrimiento",
    "# 1.Historia del descubrimiento",
    "# Historia del descubrimiento",
    "## Historia del descubrimiento",
    "## 5. Planetas interiores",
    "## 4.4 Estabilidad orbital en un sistema binario",
]

# Print what the project helper returns for each candidate against the index titles.
for i, text in enumerate(texts):
    print(f"{i=}: {pl.fix_section_title_l1_candidate(text, lst_index)}")
    

i=0: Historia del descubrimiento
i=1: # 1.Historia del descubrimiento
i=2: # Historia del descubrimiento
i=3: # Historia del descubrimiento
i=4: ## 5. Planetas interiores
i=5: ## 4.4 Estabilidad orbital en un sistema binario


## Enrich an image with its content description

In [None]:
def encode_image(image_path):
    """Read a file and return its content as a base64 string.

    Args:
        image_path: Path of the file to read (anything ``open`` accepts).

    Returns:
        The base64 encoding of the file bytes, decoded as UTF-8 text, or
        ``None`` when the file is missing or any other error occurs. In both
        failure cases a message is printed instead of raising.
    """
    encoded = None
    try:
        with open(image_path, "rb") as stream:
            raw = stream.read()
        encoded = base64.b64encode(raw).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {image_path} was not found.")
    except Exception as err:  # best-effort: report any other failure and return None
        print(f"Error: {err}")
    return encoded
    
# Encode one sample picture from the pipeline folder (None if the file is missing).
image3_b64 = encode_image(path_pipeline / "Divulgacion Planetaria Althera-picture-3.png")


In [None]:
# File names of the pictures expected in the pipeline folder.
lst_images_names = [
    "Divulgacion Planetaria Althera-picture-1.png",
    "Divulgacion Planetaria Althera-picture-2.png",
    "Divulgacion Planetaria Althera-picture-3.png",
]

# Image name -> base64 payload (None for any file that could not be read).
dc_images_b64 = {
    image_name: encode_image(path_pipeline / image_name)
    for image_name in lst_images_names
}

In [81]:
# Pass the base64 images to the project helper; the recorded output shows a
# dict mapping each image name to the text/markdown extracted from it.
dc_images_content = mistral_pl.process_images(client_mistral, dc_images_b64)
dc_images_content


{'Divulgacion Planetaria Althera-picture-1.png': '.',
 'Divulgacion Planetaria Althera-picture-2.png': '# Planetas del sistema Althéra: distancia vs masa\n\n|  Masa (M) | Distancia al baricentro (UA)  |\n| --- | --- |\n|  1 | 7  |\n|  2 | 6  |\n|  3 | 5  |\n|  4 | 4  |\n|  5 | 5  |\n|  6 | 6  |\n|  7 | 7  |\n|  8 | 8  |\n\n![img-0.jpeg](img-0.jpeg)',
 'Divulgacion Planetaria Althera-picture-3.png': '& Bellatrix\n\n*Betalgeuse\n\nCampoin 13 oct-2034\n- Oscilaciones periódicas\ndobles\n- Posible sistema binario\n- Explanadas?\n× Rigel\n× Suph\n× Rigel'}

# Exercise 1: Process a PDF file with docling
1. Get a docling document
2. Check conversion confidence
3. Visualize the MD

In [None]:
# TODO

# Exercise 2: Fix L1 Headers
1. Iterate over any SectionHeaderItem in the `.texts` argument of the docling document,
2. Use `pl.fix_section_title_l1_candidate()` and `lst_index` to promote matches to level 1  
Ensure that the MD L1 is only assigned to the first occurrence of the title and actual sections

In [None]:
# TODO

# Exercise 3: Remove Headers
1. Iterate over any TextItem in the `.texts` argument of the docling document
2. If the text exactly matches the extracted header, append the element to a list to delete

In [None]:
# TODO

# Exercise 4: Remove Footers
1. Iterate over any TextItem in the `.texts` argument of the docling document
2. Use `pl.is_valid_input()` and the LLM generated `regex_footer` to match any footer, append the element to a list to delete

In [82]:
# TODO

# Exercise 5: Enrich image content
1. Identify each image related text in the docling document, remember that ImageItem has a children argument
2. Place the extracted content on a proper TextItem or directly as a caption

In [None]:
# TODO

# Exercise 6: Remove Docling Items and convert to markdown
1. Remove selected docling items from the document
2. Write a cleaned and enriched MD file

In [None]:
# TODO