In [1]:
import os
from dotenv import load_dotenv
from mistralai import Mistral
from llama_cloud_services import LlamaParse
import base64
import nest_asyncio
import asyncio
from markitdown import MarkItDown
from docling.document_converter import DocumentConverter


# Load variables from a local .env file into the environment; the OCR wrappers
# below read MISTRAL_API_KEY and LLAMA_CLOUD_API_KEY from os.environ.
load_dotenv()
# Patch asyncio so that nested run_until_complete() calls are allowed while the
# notebook kernel's own event loop is already running.
nest_asyncio.apply()

# Input PDFs that are passed to every OCR / conversion engine in this notebook.
link_SEH_help_easy = "../data/cash/seh/SEH_Nouvelle_Version.pdf"
link_SEH_help_hard = "../data/cash/seh/Fiche-aides-financieres-moins-de-18-ans_mise-a-jour-le-18-dec.-2024.pdf"
link_SEH_form = "../data/cash/seh/LPF825_fr.pdf"



  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Mistral
class Mistral_ocr:
    """Run Mistral OCR on a local PDF and expose the result as Markdown.

    The OCR request is issued once, from the constructor; the response is kept
    in ``self.ocr_response`` and the ``get_markdown*`` helpers only read it.
    """

    def __init__(self, document_url):
        """Create the API client and immediately OCR ``document_url``.

        Args:
            document_url: Path of the PDF on disk (it is opened with ``open``,
                despite the "url" in the name).

        Raises:
            KeyError: If ``MISTRAL_API_KEY`` is not set in the environment.
            ValueError: If the PDF could not be read (see ``get_ocr_response``).
        """
        self.client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
        self.document_url = document_url
        self.ocr_response = self.get_ocr_response()

    def encode_pdf(self, pdf_path):
        """Encode the pdf to base64.

        Args:
            pdf_path: Path of the file to read.

        Returns:
            The file content as a base64 ``str``, or ``None`` when the file
            could not be read (the error is printed, not raised).
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except FileNotFoundError:
            print(f"Error: The file {pdf_path} was not found.")
            return None
        except Exception as e:  # best-effort: report any other read error and signal failure with None
            print(f"Error: {e}")
            return None

    def get_ocr_response(self):
        """Send the PDF to the ``mistral-ocr-latest`` model and return the response.

        The file is embedded in the request as a base64 ``data:`` URI.

        Raises:
            ValueError: If ``encode_pdf`` failed. Without this guard the literal
                text ``None`` would be interpolated into the data URI and a
                meaningless request would be sent to the API.
        """
        base64_pdf = self.encode_pdf(self.document_url)
        if base64_pdf is None:
            raise ValueError(
                f"Could not read PDF '{self.document_url}'; OCR request was not sent."
            )
        return self.client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}",
            },
            include_image_base64=True,
        )

    def get_markdown(self):
        """Return the Markdown of all pages, separated by a blank line."""
        return "\n\n".join(page.markdown for page in self.ocr_response.pages)

    def get_markdown_from_page(self, page_index):
        """Return the Markdown of the page at ``page_index`` in the response's page list."""
        return self.ocr_response.pages[page_index].markdown
    
class LlamaParse_ocr:
    """Parse a document with LlamaParse and expose the result as Markdown.

    The parse job is started once, from the constructor; the result is kept in
    ``self.ocr_response`` and the ``get_markdown*`` helpers only read it.
    """

    def __init__(self, document_url):
        """Configure the parser and immediately parse ``document_url``.

        Args:
            document_url: Document handed to ``LlamaParse.aparse``.

        Raises:
            KeyError: If ``LLAMA_CLOUD_API_KEY`` is not set in the environment.
        """
        self.parser = LlamaParse(
            api_key=os.environ["LLAMA_CLOUD_API_KEY"],
            verbose=True,
            language="fr",
            balance_mode=True,
        )
        self.document_url = document_url
        self.ocr_response = self.get_ocr_response()

    def get_ocr_response(self):
        """Run ``parser.aparse`` to completion and return its result.

        Only the event-loop lookup is guarded by ``try``. If the parse call
        itself were inside the guard, a ``RuntimeError`` raised while parsing
        would be mistaken for "no running loop" and the document would be
        submitted a second time through ``asyncio.run``.
        """
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: start a fresh one.
            return asyncio.run(self.parser.aparse(self.document_url))
        # Already inside a running loop (e.g. the notebook kernel). Re-entering
        # it with run_until_complete() relies on nest_asyncio.apply() having
        # been called beforehand.
        return loop.run_until_complete(self.parser.aparse(self.document_url))

    def get_markdown(self):
        """Return the text of the first Markdown document of the parse result."""
        return self.ocr_response.get_markdown_documents()[0].text

    def get_markdown_from_page(self, page_index):
        """Return the text of entry ``page_index`` of the per-page Markdown documents."""
        markdown_documents = self.ocr_response.get_markdown_documents(split_by_page=True)
        return markdown_documents[page_index].text

class Markitdown_ocr:
    """Convert a document with MarkItDown and expose the resulting text."""

    def __init__(self, document_url):
        """Store the source location and run the conversion right away."""
        self.document_url = document_url
        self.ocr_response = self.get_ocr_response()

    def get_ocr_response(self):
        """Convert ``self.document_url`` with a freshly created ``MarkItDown`` engine."""
        return MarkItDown().convert(self.document_url)

    def get_markdown(self):
        """Return the ``text_content`` of the stored conversion result."""
        conversion = self.ocr_response
        return conversion.text_content

class Docling_ocr:
    """Convert a document with Docling and expose it as Markdown."""

    def __init__(self, document_url):
        """Store the source location and run the conversion right away."""
        self.document_url = document_url
        self.ocr_response = self.get_ocr_response()

    def get_ocr_response(self):
        """Convert ``self.document_url`` with a freshly created ``DocumentConverter``."""
        return DocumentConverter().convert(self.document_url)

    def get_markdown(self):
        """Return the Markdown export of the converted document."""
        return self.ocr_response.document.export_to_markdown()


In [4]:
# Convert the three source PDFs with Docling (the conversion runs in the constructor).
Docling_help_hard = Docling_ocr(link_SEH_help_hard)
Docling_form = Docling_ocr(link_SEH_form)
Docling_help_easy = Docling_ocr(link_SEH_help_easy)

# Write each Markdown export into the output folder of its document.
for output_path, converted in (
    ("./data/Fiche-aide-financiere/dataDocling.md", Docling_help_hard),
    ("./data/LPF825/dataDocling.md", Docling_form),
    ("./data/SEH_Nouvelle_version/dataDocling.md", Docling_help_easy),
):
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(converted.get_markdown())


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  np.nanmean(
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  np.nanmean(
  np.nanmean(
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  np.nanmean(


In [None]:
# OCR the three source PDFs with Mistral (the request is sent in the constructor).
Mistra_form = Mistral_ocr(link_SEH_form)
Mistral_help_hard = Mistral_ocr(link_SEH_help_hard)
Mistral_help_easy = Mistral_ocr(link_SEH_help_easy)

# Write each Markdown result into the output folder of its document.
for output_path, converted in (
    ("./data/Fiche-aide-financiere/mistral.md", Mistral_help_hard),
    ("./data/LPF825/mistral.md", Mistra_form),
    ("./data/SEH_Nouvelle_version/mistral.md", Mistral_help_easy),
):
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(converted.get_markdown())

In [31]:
# Parse the three source PDFs with LlamaParse (the job is started in the constructor).
LLama_form = LlamaParse_ocr(link_SEH_form)
LLama_help_hard = LlamaParse_ocr(link_SEH_help_hard)
LLama_help_easy = LlamaParse_ocr(link_SEH_help_easy)

# Write each Markdown result into the output folder of its document.
for output_path, converted in (
    ("./data/LPF825/llama_parse.md", LLama_form),
    ("./data/SEH_Nouvelle_version/llama_parse.md", LLama_help_easy),
    ("./data/Fiche-aide-financiere/llama_parse.md", LLama_help_hard),
):
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(converted.get_markdown())



Started parsing the file under job_id aaed580e-e14f-48ca-98cb-90da4c5dfadb
Started parsing the file under job_id 97b294e0-79d3-4c31-9cc8-91b570eb5557
Started parsing the file under job_id 9fb1553d-506b-4367-ae9f-6c4bd24bf284


In [32]:
# Convert the three source PDFs with MarkItDown (the conversion runs in the constructor).
Markitdown_form = Markitdown_ocr(link_SEH_form)
Markitdown_help_hard = Markitdown_ocr(link_SEH_help_hard)
Markitdown_help_easy = Markitdown_ocr(link_SEH_help_easy)

# Write each Markdown result into the output folder of its document.
for output_path, converted in (
    ("./data/LPF825/markitdown.md", Markitdown_form),
    ("./data/SEH_Nouvelle_version/markitdown.md", Markitdown_help_easy),
    ("./data/Fiche-aide-financiere/markitdown.md", Markitdown_help_hard),
):
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(converted.get_markdown())
