In [12]:
# Check if Poppler is installed
!command -v pdftotext > /dev/null && echo "Poppler is installed." || echo "Poppler is not installed. Please run 'brew install poppler'."

Poppler is installed.


In [13]:
from pathlib import Path
import shutil

import re

import base64
from dotenv import load_dotenv
from uuid import uuid4
from IPython.display import display, Markdown

from pdf2image import convert_from_path

from langchain_openai import ChatOpenAI

from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import BaseOutputParser
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore

from tenacity import retry, wait_exponential, stop_after_attempt
from tqdm import tqdm

from PIL import Image as PIL_Image

load_dotenv(override=True)

True

#### Configs

In [14]:
# doc
DOCUMENT_NAME = "SEH_Nouvelle_Version.pdf"

In [15]:
# Dir
DOC_DIR = Path("__file__").parent / "data/cash/seh/"
DOC_DIR.mkdir(parents=True, exist_ok=True)

IMG_DIR = Path("__file__").parent / "data/cash/seh/images"
if IMG_DIR.exists():
    for item in IMG_DIR.iterdir():
        if item.is_file():
            item.unlink()
        else:
            shutil.rmtree(item)
IMG_DIR.mkdir(parents=True, exist_ok=True)

In [16]:
# Document Path
DOCUMENT_PATH = DOC_DIR / DOCUMENT_NAME

In [17]:
# MLLM
MLLM_MODEL = "gpt-4o"
MLLM_TEMPERATURE = 0

mllm = ChatOpenAI(
    model=MLLM_MODEL,
    temperature=MLLM_TEMPERATURE,
)

#### Utils

In [18]:
def print_image(image_path):
    """print image in notebook"""
    pil_image = PIL_Image.open(image_path)
    original_width, original_height = pil_image.size

    # Resize the image
    new_width = 512
    new_height = int((new_width / original_width) * original_height)
    resized_image = pil_image.resize((new_width, new_height), PIL_Image.LANCZOS)

    return resized_image

In [19]:
def print_mk(text):
    """Prints text as markdown in Jupyter Notebook."""
    display(Markdown(text))

In [20]:
def encode_image(image_path):
    """Encodes an image to base64."""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
    return encoded_string

#### Parse PDF

In [21]:
# Convert each page of the PDF to an image
images = convert_from_path(DOCUMENT_PATH, thread_count=4, fmt="JPEG", dpi=300)

In [22]:
# Add img_path to each page
pages = []
for image in enumerate(images):
    image_name = f"{str(uuid4())}.jpg"
    # Save the image to a file
    image_path = IMG_DIR / image_name
    image[1].save(image_path, "JPEG")

    encoded_image = encode_image(image_path)

    pages.append(
        Document(
            page_content="",
            metadata={
                "img_path": str(IMG_DIR / image_name),
                "img_encoded": encoded_image,
                "page": image[0] + 1,
            },
        )
    )

In [23]:
%%time
# Parse Images as Markdown

PARSING_PROMPT = """
**Objective:** Directly extract content from the input image (representing a document page) into Markdown format, maintaining the visual sequence and providing detailed descriptions for visual elements.

**Instructions:**

1.  **Sequential Extraction:** Process the image content following the natural reading order (e.g., top-to-bottom, left-to-right, respecting columns).
2.  **Text Extraction:** Extract all visible text blocks. Preserve approximate paragraph or line structure as observed on the page. Add Markdown syntax for title, headings, lists, and other formatting as appropriate.
3.  **Table Extraction:** Identify any tables. Extract their content (headers and rows) accurately and format them as standard Markdown tables. Place the Markdown table in the output sequence where it appears on the page.
4.  **Visual Element Description:**
    * Identify significant non-text visual elements (e.g., images, photographs, charts, graphs, diagrams, illustrations).
    * In the output sequence where the visual element appears, insert a detailed description enclosed within `<VISUAL ELEMENT>` and `</VISUAL ELEMENT>` tags.
    * **The description should be comprehensive and accurate, covering:**
        * The type of visual (e.g., "bar chart," "photograph," "flowchart").
        * The main subject or data represented.
        * Key features, trends, or objects depicted.
        * Any clearly legible text *within* the visual (like titles, labels, captions, data points).
    * **Example Format:**
        <VISUAL ELEMENT>
        Bar chart comparing sales figures for Q1-Q4 2024. Shows significant growth in Q3. Categories are Product A, Product B, Product C. Text labels include 'Quarterly Sales 2024', 'Sales (in $M)', and specific values on axes.
        </VISUAL ELEMENT>
5.  **Output Generation:** Combine the extracted text, Markdown tables, and the tagged visual descriptions into a single, continuous Markdown stream that reflects the original page order.
6.  **Strict Output:** Return *only* the generated Markdown content. Do not include any introductory phrases, concluding remarks, or other commentary outside the extracted content itself.
"""


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
)
def parse_image(image_encoded):
    class CustomMarkdownParser(BaseOutputParser):
        def parse(self, text: str) -> str:
            text = text.strip()
            fence_re = re.compile(r"```(?:\w+)?\s*(.*?)\s*```", flags=re.S)
            text = fence_re.sub(lambda m: m.group(1).strip(), text)
            return text

    image_message = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_encoded}"},
    }
    text_message = {
        "type": "text",
        "text": PARSING_PROMPT,
    }

    message = HumanMessage(content=[text_message, image_message])

    output = mllm.invoke([message]).content
    output = CustomMarkdownParser().parse(output)
    return output


# Parse each page and print the response
for page in tqdm(pages):
    page.page_content = parse_image(page.metadata["img_encoded"])

100%|██████████| 9/9 [03:10<00:00, 21.15s/it]

CPU times: user 123 ms, sys: 43.9 ms, total: 167 ms
Wall time: 3min 10s





In [24]:
# visual inspection of the first page
pages[0].__dict__

{'id': None,
 'metadata': {'img_path': 'data/cash/seh/images/0803d051-4624-4d2a-85f9-be04e7d2eb3d.jpg',
  'img_encoded': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAzkCfYDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDSooor6k+NCiiigAooooAKKKKACiiigAooooAKKKKACiiigA

In [25]:
# print extracted content in markdown
print_mk(pages[0].page_content)

# Mode d'emploi Autisme

**Transmettre Sensibiliser Accompagner**

Des fiches pratiques sur l'autisme pour vous accompagner au quotidien

---

## Supplément pour enfant handicapé

# Comment rédiger votre lettre type

Le supplément pour enfant handicapé (SEH) est une aide financière pour les familles. Nous vous recommandons d’ajouter une lettre type à votre formulaire. Présentez cette lettre type à l’école et au spécialiste consulté avant de leur faire compléter leur partie du formulaire.

Pour avoir droit au supplément pour enfant handicapé, une personne doit :
- être admissible à l’Allocation famille
- avoir à sa charge un enfant de moins de 18 ans ayant une déficience ou un trouble des fonctions mentales qui le limite de façon importante dans la réalisation de ses habitudes de vie pendant une période prévisible d’au moins un an.

Quelles sont les « habitudes de vie » ?  
Les habitudes de vie sont celles qu’un enfant doit réaliser, selon son âge, pour prendre soin de lui-même et participer à la vie sociale. Les habitudes de vie qui sont considérées dans l’analyse des demandes sont les suivantes :
- la nutrition;
- les soins personnels;
- les déplacements;
- la communication;
- les relations interpersonnelles;
- les responsabilités;
- l’éducation.

<VISUAL ELEMENT>
Illustration of a person checking off items on a large checklist. The person is standing next to a large paper with checkmarks. The background includes abstract plant designs.
</VISUAL ELEMENT>

---

<VISUAL ELEMENT>
Circular diagram with text: "Le formulaire Retraite Québec - Demande de supplément pour enfant handicapé" and "Le bulletin Retraite Québec - Critères spécifiques".
</VISUAL ELEMENT>

---

<VISUAL ELEMENT>
Icon with text: "Cliquez ici pour savoir qui peut vous aider et qui peut compléter le formulaire."
</VISUAL ELEMENT>

Parce qu’il n’est pas toujours facile d’avoir une idée précise de ce qui se cache derrière ces mots, nous vous proposons de vous donner quelques clefs pour bien rédiger votre lettre type et ne pas oublier de mentionner certaines caractéristiques importantes de votre enfant.

---

**Fédération québécoise de l’autisme**  
3396, rue Jean-Talon Est, Montréal (Québec) H2A 1W8  
Téléphone : 514.270.7386 ou 1 888 830.2833  
Courriel : info@autisme.qc.ca  
Site internet: autisme.qc.ca

In [None]:
## NOTE: Uncomment the following line to print the image in the notebook
# print_image(pages[0].metadata["img_path"])

#### Convert Pages into Chunks

In [28]:
# Split the parsed pages into smaller chunks
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=1200, chunk_overlap=200
)

chunks = md_splitter.split_documents(pages)
print(f"Total chunks: {len(chunks)}")

Total chunks: 43


In [29]:
# visual inspection the chunk#3
chunks[3].__dict__

{'id': None,
 'metadata': {'img_path': 'data/cash/seh/images/0803d051-4624-4d2a-85f9-be04e7d2eb3d.jpg',
  'img_encoded': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAzkCfYDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDSooor6k+NCiiigAooooAKKKKACiiigAooooAKKKKACiiigA

In [30]:
# print extracted content in markdown
print_mk(chunks[3].page_content)

---

<VISUAL ELEMENT>
Circular diagram with text: "Le formulaire Retraite Québec - Demande de supplément pour enfant handicapé" and "Le bulletin Retraite Québec - Critères spécifiques".
</VISUAL ELEMENT>

---

<VISUAL ELEMENT>
Icon with text: "Cliquez ici pour savoir qui peut vous aider et qui peut compléter le formulaire."
</VISUAL ELEMENT>

Parce qu’il n’est pas toujours facile d’avoir une idée précise de ce qui se cache derrière ces mots, nous vous proposons de vous donner quelques clefs pour bien rédiger votre lettre type et ne pas oublier de mentionner certaines caractéristiques importantes de votre enfant.

---

**Fédération québécoise de l’autisme**  
3396, rue Jean-Talon Est, Montréal (Québec) H2A 1W8  
Téléphone : 514.270.7386 ou 1 888 830.2833  
Courriel : info@autisme.qc.ca  
Site internet: autisme.qc.ca

In [31]:
# Sanity check of the chunks

low_quality_chunks = []
for i, chunk in enumerate(chunks):
    # Check if the chunk is too small or too large
    if len(chunk.page_content) < 100 or len(chunk.page_content) > 1200:
        low_quality_chunks.append(chunk)
print(f"Total low quality chunks: {len(low_quality_chunks)}/{len(chunks)}")

Total low quality chunks: 2/43


#### Cleanup

In [33]:
# Delete images folder
shutil.rmtree(IMG_DIR)