In [5]:
import json
from pathlib import Path
from pdf2image import convert_from_path
import pytesseract
from ocr.config import PDF_DIR, JSN_DIR

def ocr_invoice(pdf_path):
    """
    Extract the text of a PDF invoice with OCR.

    Parameters:
    pdf_path (str): Location of the PDF invoice on disk.

    Returns:
    str: The OCR text of every rendered image, concatenated in the order
    the images were produced, each image's text followed by a newline.
    """
    # Render the PDF into a list of images
    pages = convert_from_path(pdf_path)

    # Run OCR on every image and terminate each result with a newline
    page_texts = [pytesseract.image_to_string(page) + "\n" for page in pages]

    # Glue the per-image results together (empty string when there are none)
    return "".join(page_texts)

def save_to_json(data, json_path):
    """
    Save data to a JSON file under the key "extracted_text".

    The file is written as UTF-8. Because non-ASCII characters are emitted
    verbatim (ensure_ascii=False), relying on the platform's default
    encoding could fail or produce a file that is not valid UTF-8 JSON.

    Parameters:
    data (str): The data to be saved.
    json_path (str): The file path to the JSON file (overwritten if it exists).
    """
    # Explicit encoding: accented characters must be written as UTF-8
    # regardless of the locale the interpreter runs under.
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump({"extracted_text": data}, json_file, ensure_ascii=False, indent=4)

# Example usage: OCR one invoice, store the text as JSON, then inspect it
pdf_path = Path(PDF_DIR) / '20240705-02700987.pdf'
extracted_text = ocr_invoice(pdf_path)

json_path = Path(JSN_DIR) / 'extracted_text.json'
save_to_json(extracted_text, json_path)

import pprint as pp
# Alternative view of the raw string: pp.pprint(extracted_text)
# Print each line of the OCR result as its list of whitespace-separated tokens
for text_line in extracted_text.split('\n'):
    tokens = text_line.split()
    print(tokens)

['Facture', '20240705-02700987']
[]
['Emetteur']
[]
['Chargemap']
[]
['7', 'Allée', 'Cérés']
[]
['67000', 'Strasbourg']
[]
['France']
['accounting@chargemap.com']
[]
['Date', 'de', 'facturation']
['05/07/2024']
[]
['Libellé']
[]
['Charge', 'effectuée', 'le', 'samedi,', 'le', '1', 'juin', '2024', 'a', '14h35', 'a', 'Interparking']
['Confluence', '-', 'Rue', 'du', 'Grognon', '-', 'Namur', '-', 'Quentin', 'Deliére', '-', '2-EWZ-782']
['(Model', '3)']
[]
['Charge', 'effectuée', 'le', 'samedi,', 'le', '8', 'juin', '2024', 'a', '14h39', 'a', 'Interparking']
['Confluence', '-', 'Rue', 'du', 'Grognon', '-', 'Namur', '-', 'Quentin', 'Deliére', '-', '2-EWZ-782']
['(Model', '3)']
[]
['Charge', 'effectuée', 'le', 'samedi,', 'le', '15', 'juin', '2024', 'a', '19h37', 'a', 'Interparking', '-']
['Namur', '-', 'rue', 'du', 'Grognon', '2', '-', 'Quentin', 'Deliére', '-', '2-EWZ-782', '(Model', '3)']
[]
['Charge', 'effectuée', 'le', 'jeudi,', 'le', '27', 'juin', '2024', 'a', '07h19', 'a', 'Allego', '-', 