In [8]:
import fitz  # PyMuPDF
import cv2
import numpy as np
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_to_grayscale(image):
    if len(image.shape) == 2 or image.shape[2] == 1:
        return image
    elif image.shape[2] == 3:
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    elif image.shape[2] == 4:
        return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)

def count_pixels_below_value(image, value):
    return np.sum(image < value)

def is_checkbox_marked(checkbox_region, threshold=0.5, dark_pixel_value=128):
    gray_region = convert_to_grayscale(checkbox_region)
    total_pixels = gray_region.size
    dark_pixels_count = count_pixels_below_value(gray_region, dark_pixel_value)
    dark_pixels_percentage = dark_pixels_count / total_pixels
    return dark_pixels_percentage >= threshold

def process_image(image, page):
    gray = convert_to_grayscale(image)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Define minimum and maximum dimensions for a checkbox
    min_dim = 8  # Minimum dimension for a checkbox to be considered
    max_dim = 15  # Maximum dimension for a checkbox to be considered
    min_aspect_ratio = 0.8  # Minimum aspect ratio to consider as checkbox
    max_aspect_ratio = 1.2  # Maximum aspect ratio to consider as checkbox

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Filter out non-checkbox contours by dimension and aspect ratio
        if w < min_dim or h < min_dim or w > max_dim or h > max_dim:
            continue
        if not min_aspect_ratio <= (w / h) <= max_aspect_ratio:
            continue
        checkbox_region = gray[y:y+h, x:x+w]
        marked = is_checkbox_marked(checkbox_region)
        text_x, text_y = x + w, y + h // 2  # Adjust as needed for text placement
        if marked:
            # Insert a checked box Unicode character
            page.insert_text((text_x, text_y), "box", fontsize=11, color=(0, 0, 1))
        else:
            # Insert an unchecked box Unicode character
            page.insert_text((text_x, text_y), "no box", fontsize=11, color=(1, 0, 0))

pdf_path = 'test_0.pdf'
pdf_document = fitz.open(pdf_path)

for page_number in range(len(pdf_document)):
    page = pdf_document.load_page(page_number)
    zoom = 2  # A zoom factor of 2 improves the quality of the image
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat)
    image_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
    image_np = cv2.cvtColor(image_np, cv2.COLOR_RGBA2BGR if pix.n == 4 else cv2.COLOR_RGB2BGR)
    process_image(image_np, page)

pdf_document.save("annotated_output.pdf")
pdf_document.close()


In [None]:
[
  {
    "titre": {
      "nom_etablissement": "string"
    },
    "date": {
      "date_rcp": "string",
      "nom_responsable": "string",
      "membres": {
        "hepatogastroenterologues": [],
        "oncologues": [],
        "radiologues_interventionnels": [],
        "chirurgiens": [],
        "radiotherapeutes": [],
        "anatomopathologiste": [],
        "radiologue_diagnostique": [],
        "infirmiers": []
      },
      "patient": {
        "sexe": "0,1,2",
        "date_naissance": "string",
        "coordonnees_medecin_traitant": "string",
        "coordonnees_medecin_adresseur": "string",
        "dossier_passe_RCP": "bool"
      },
      "motif": {
        "1ere_presentation": "bool",
        "dossier_deja_discute_le": "string",
        "motif_presentation": {
          "decision_traitement": "bool",
          "avis_diagnostique": "bool",
          "ajustement_therapeutique": "bool",
          "surveillance_post_traitement": "bool"
        }
      },
      "atcd": {
        "diabete_cardiovasculaire_comorbidite": "string",
        "traitement_courant": "string"
      }
    },
    "Donnees_Biologique_moins_de_4_semaines": {
      "Bilirubine_totale(µmol/L)": "float",
      "Creatinine(µmol/L)": "float",
      "Albumine(g/L)": "float",
      "Plaquettes(G/L)": "float",
      "TP": {
        "pourcentage": "float",
        "AVK_ou_AOD": "bool"
      },
      "Facteur_V(%)": "float",
      "Alpha-foetoprotéine(ng/mL)": "float"
    },
    "Souhait_prise_en_charge_post_rcp": {
      "accord": "bool",
      "souhait_patient": "str",
      "eligibilite": "bool"
    },
    "Information_patient_post_RCP": {
      "accord_pour_prévenir_prise_en_charge": "bool",
      "comment_informer": "bool"
    },
    "Decision_prise_en_charge": {
      "Transplantation": "bool",
      "Radio_embolisation": "bool",
      "Exerese_Chirurgie": {
        "value": "bool",
        "précision": "str"
      },
      "Chimio_Embolisation": "bool",
      "Traitement_systémique": "bool",
      "Essaie_thérapeutique": "bool",
      "Radiothérapie": "bool",
      "Soins_de_confort": "bool",
      "Precision/hiearchisation_si_propositions": "string"
    }
  "Cirrhose documentee": False,
    "arguments": {
        "clinique": False,
        "marqueurs non-invasifs": False,
        "imagerie": False,
        "biopsie": False
    }
}gastroscopie = {
    "oui, date de la dernière": str,
    "non": False,
    "Varices_oesophagiennes_gastriques": {
        "Non recherchées": False,
        "Absentes": False,
        "Présentes, grade des VO:": str,
    }
}ascite = {
    "Absente": bool,
    "Modérée": bool,
    "Abondante": bool,
    "Pas d'ascite clinique mais visible en imagerie": bool,
    "Pas d'ascite actuellement mais ATCD d'ascite": bool
}encephalopathie = {
    "oui": bool,
    "non": bool,
    "Si oui, date du diagnostic": str
}alcool = {
    "OUI, depuis combien de temps": str,
    "NON, consommation actuelle d'alcool": float
}histoire_de_la_maladie = {
    "date_du_diagnostic": str,
    "Prouvé histologiquement": bool,
    "traitements_locoregionaux": {
        "intervention_1": {
            "Type d'intervention": str,
            "Nombre de cures": int,
            "Date de la dernière intervention": str,
            "Localisation": str,
            "Taille": float,
            "Information complémentaire": str
        },
        "intervention_2": {
            "Type d'intervention": str,
            "Nombre de cures": int,
            "Date de la dernière intervention": str,
            "Localisation": str,
            "Taille": float,
            "Information complémentaire": str
        },
        "intervention_3": {
            "Type d'intervention": str,
            "Nombre de cures": int,
            "Date de la dernière intervention": str,
            "Localisation": str,
            "Taille": float,
            "Information complémentaire": str
        }
    },
    "traitements_systémiques": [
        {
            "Nombre de lignes": int,
            "traitements": [
                {
                    "type": str,
                    "Date de début": str,
                    "Date de fin": str,
                    "Motif arrêt": str
                },
                {
                    "type": str,
                    "Date de début": str,
                    "Date de fin": str,
                    "Motif arrêt": str
                },
                {
                    "type": str,
                    "Date de début": str,
                    "Date de fin": str,
                    "Motif arrêt": str
                }
            ],
            "Soins de confort": bool,
            "Si oui, préciser": str
        }
    ]
}imc = {
    "IMC": float,
    "ECOG": float
  }
  
]


In [12]:
#### Test OCR and Checkbox 2 

import cv2
import pytesseract
import fitz  # PyMuPDF

# Configure Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Define a function to process a single page
def process_pdf_page(page):
    # Convert the PDF page to a PIL image
    pix = page.get_pixmap()
    
    # Convert the Pixmap to a NumPy array
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

    # Convert from RGB (PyMuPDF's default) to BGR for OpenCV
    img_cv = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    
    # Convert to grayscale
    gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
    
    # Threshold the image to get binary image
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    
    # Find contours (this should include checkboxes)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    # Loop through contours to find checkboxes and extract text
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Determine the region where the checkbox and text should be
        checkbox_roi = thresh[y:y+h, x:x+w]
        
        # Check if the checkbox is marked based on pixel intensity
        if cv2.countNonZero(checkbox_roi) > (w * h * 0.5):
            is_checked = True
        else:
            is_checked = False
        
        # Extract the text region next to the checkbox
        text_roi = gray[y:y+h, x+w:x+w*4]  # Adjust the width multiplier as needed
        text = pytesseract.image_to_string(text_roi, config='--psm 6')
        
        # Print the status of checkbox and the extracted text
        print(f"Checkbox checked: {is_checked}, Text: {text}")

# Open the PDF file with PyMuPDF
pdf_document = fitz.open('test_0.pdf')

# Process each page
for page_num in range(len(pdf_document)):
    page = pdf_document[page_num]
    process_pdf_page(page)

pdf_document.close()


Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: False, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text

KeyboardInterrupt: 

In [6]:
import PyPDF2

# Open the PDF file
with open("./test_0.pdf", "rb") as file:
    reader = PyPDF2.PdfReader(file)
    
    # Get all form fields in the PDF
    fields = reader.get_fields()
    print(fields)
    # Optionally, print all fields
    for name, field in fields:
        print(f"Field Name: {name}")
        print(f"Field Details: {field}")


None


TypeError: 'NoneType' object is not iterable

In [12]:
with open("test_0.pdf", "rb") as file:
    reader = PyPDF2.PdfReader(file)
    fields = reader.get_fields()

    if fields is not None:
        for name, field in fields.items():
            if field.get('/FT') == '/Btn':  # Checkboxes have the field type '/Btn'
                if field.get('/V') == '/Yes':  # The value '/Yes' often indicates a checked checkbox
                    print(f"Checkbox {name} is checked.")
                else:
                    print(f"Checkbox {name} is not checked.")
            else:
                print(f"Field Name: {name}")
                print(f"Field Details: {field}")
    else:
        print("No fields found or get_fields() returned None.")


No fields found or get_fields() returned None.


In [16]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal

for page_layout in extract_pages("test_0.pdf"):
    for element in page_layout:
        if isinstance(element, LTTextBoxHorizontal):
            print(element.get_text())


ModuleNotFoundError: No module named 'pdfminer.high_level'

In [21]:
import fitz  # PyMuPDF

print("Opening the PDF file...")
pdf_document = fitz.open("test_0.pdf")

print(f"The document has {len(pdf_document)} pages.")
# Iterate through each page
for current_page in range(len(pdf_document)):
    print(f"Processing page {current_page}...")
    page = pdf_document[current_page]
    
    # Access the form's widget annotations (where form data is stored)
    widgets = page.widgets()
    if widgets:  # Check if there are any widgets on the current page
        print(f"Found {(widgets)} widgets on page {current_page}.")
        for widget in widgets:
            info = widget.field_info
            if widget.field_type == fitz.PDF_WIDGET_TYPE_CHECKBOX:
                print(f"Checkbox found: {info}")
            else:
                print(f"Form field found: {info}")
    else:
        print(f"No widgets found on page {current_page}.")

# Close the PDF document
pdf_document.close()
print("Closed the PDF file.")


Opening the PDF file...
The document has 3 pages.
Processing page 0...
Found <generator object Page.widgets at 0x000002366A18E890> widgets on page 0.
Processing page 1...
Found <generator object Page.widgets at 0x000002366A246DD0> widgets on page 1.
Processing page 2...
Found <generator object Page.widgets at 0x000002366A18E890> widgets on page 2.
Closed the PDF file.


In [20]:
!pip install --upgrade pymupdf



In [None]:
counter_total_line = 0 
counter_barz = 0 
distance_stop_hauteur, distance_stop_longueur = 0 , 0 



In [23]:
reader = PyPDF2.PdfReader("test_0.pdf")
cnt = len(reader.pages)
print("reading pdf (%d pages)" % cnt)
page = reader.pages[cnt-1]
lines = page.extract_text().splitlines()
print("%d lines extracted..." % len(lines))
reader.get_fields() # returns None
reader.get_form_text_fields() # returns None

reading pdf (3 pages)
61 lines extracted...


{}

In [31]:
# Import PDF reader library
from PyPDF2 import PdfReader# Open the PDF file
with open("test.pdf", "rb") as file:
    # Create a PDF file reader
    reader = PdfReader(file)
    # Get the number of pages
    num_pages = len(reader.pages)
    # Print the number of pages
    print(num_pages)
    # Get the first page
    page = reader.pages[0]
    # Extract the text from the first page
    text = page.extract_text()
    # Print the text
    print(text)

5
F I C H E
R C P
C H C
D E
.
CHU
de
paris
A
compléter
et
à
envoyer
à
avec
si
possible
un
bilan
biologique
récent
(de
moins
de
4
semaines),
et
une
imagerie
de
moins
de
6
semaines
Présentation
en
RCP
le
mardi
suivant
l’envoi
de
la
fiche
D a t e
d e
l a
R C P
:
… … … … … … . . … … …
N o m
d u
r e s p o n s a b l e
d e
l a
R C P
:
… … … … … … … … … … … … … … … … … … … … .
M e m b r e s
d e
l a
R C P
( c o c h e r
l a
c a s e
s i
l e
m e m b r e
e s t
p r é s e n t )
H é p a t o
g a s t r o e n t é r o l o g u e ( s )
:
☑
… … … … … … … … … … … … …
☑
… … … … … … … … … … … … … 
O n c o l o g u e ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … … 
R a d i o l o g u e ( s )
i n t e r v e n t i o n n e l ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … … 
C h i r u r g i e n ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … …
R a d i o t h é r a p e u t e ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … … 
A n a t o m o p a t h o l o g i s t e 

In [4]:
from PyPDF2 import PdfReader

# Open the PDF file
with open("test_final.pdf", "rb") as file:
    # Create a PDF file reader
    reader = PdfReader(file)
    # Get the number of pages
    num_pages = len(reader.pages)
    # Print the number of pages
    print(f"Number of pages: {num_pages}")

    # Loop through all the pages
    for i in range(num_pages):
        # Get the page
        page = reader.pages[i]
        # Extract the text from the page
        text = page.extract_text()
        # Print the text
        print(f"Text from page {i+1}:")
        print(text)
        print("\n" + "="*40 + "\n")  # Add a separator between the pages


Number of pages: 5
Text from page 1:
F I C H E
R C P
C H C
D E
.
CHU
de
paris
A
compléter
et
à
envoyer
à
avec
si
possible
un
bilan
biologique
récent
(de
moins
de
4
semaines),
et
une
imagerie
de
moins
de
6
semaines
Présentation
en
RCP
le
mardi
suivant
l’envoi
de
la
fiche
D a t e
d e
l a
R C P
:
2 4 / 0 5 / 1 9 5 0 … …
N o m
d u
r e s p o n s a b l e
d e
l a
R C P
:
… J e a n
P i e r r e … … … … .
M e m b r e s
d e
l a
R C P
( c o c h e r
l a
c a s e
s i
l e
m e m b r e
e s t
p r é s e n t )
H é p a t o
g a s t r o e n t é r o l o g u e ( s )
:
☑
P i e r r e … … … … … …
☑
T a l i s s a … … … … … … … … 
O n c o l o g u e ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … … 
R a d i o l o g u e ( s )
i n t e r v e n t i o n n e l ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … … 
C h i r u r g i e n ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … …
R a d i o t h é r a p e u t e ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … … 
A n a t o

In [42]:
import re
from PyPDF2 import PdfReader

# Open the PDF file
with open("test_final_2.pdf", "rb") as file:
    reader = PdfReader(file)

    # Initialize an empty string to store the concatenated text from all pages
    all_text = ""

    # Loop through all the pages
    for page in reader.pages:
        # Extract the text from the page
        text = page.extract_text()

        # Concatenate the text from this page to the overall string
        all_text += text

# Example regex check: Looking for any digits, simplifying to check functionality.
print(all_text)
matches = re.findall(r"\d+", all_text)
if matches:
    print("Numbers found:", matches)
else:
    print("No numbers found.")

F I C H E
R C P
C H C
D E
.
CHU
de
paris
A
compléter
et
à
envoyer
à
avec
si
possible
un
bilan
biologique
récent
(de
moins
de
4
semaines),
et
une
imagerie
de
moins
de
6
semaines
Présentation
en
RCP
le
mardi
suivant
l’envoi
de
la
fiche
D a t e
d e
l a
R C P
:
2 4 / 0 5 / 1 9 5 0 … …
N o m
d u
r e s p o n s a b l e
d e
l a
R C P
:
… J e a n
P i e r r e … … … … .
M e m b r e s
d e
l a
R C P
( c o c h e r
l a
c a s e
s i
l e
m e m b r e
e s t
p r é s e n t )
H é p a t o
g a s t r o e n t é r o l o g u e ( s )
: 
R a d i o t h é r a p e u t e ( s )
:
☑
P i e r r e … … … … … …
❑
… … … … … … … … … … … … …
☑
T a l i s s a … … … … … … … …
❑
… … … … … … … … … … … … …
O n c o l o g u e ( s )
: 
A n a t o m o p a t h o l o g i s t e ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … …
R a d i o l o g u e ( s )
i n t e r v e n t i o n n e l ( s )
: 
R a d i o l o g u e ( s )
d i a g n o s t i q u e ( s )
:
❑
… … … … … … … … … … … … …

In [27]:
date_rcp_pattern = r"D\s*a\s*t\s*e\s*d\s*e\s*l\s*a\s*R\s*C\s*P\s*:\s*([\d\s]+/\s*[\d\s]+/\s*[\d\s]+)"
nom_responsable_pattern = r"N\s*o\s*m\s*d\s*u\s*r\s*e\s*s\s*p\s*o\s*n\s*s\s*a\s*b\s*l\s*e\s*d\s*e\s*l\s*a\s*R\s*C\s*P\s*:\s*([A-Za-z\s\.]+)\.\s*\.\s*\."
sexe_pattern = r"S\s*e\s*x\s*e\s*M\s*/\s*F\s*/\s*A\s*:\s*([MFA])"
date_naissance_pattern = r"D\s*a\s*t\s*e\s*d\s*e\s*n\s*a\s*i\s*s\s*s\s*a\s*n\s*c\s*e\s*:\s*([\d\s]+-[\d\s]+-[\d\s]+)"

# Search for matches using regex patterns
date_rcp_match = re.search(date_rcp_pattern, all_text)
nom_responsable_match = re.search(nom_responsable_pattern, all_text)
sexe_match = re.search(sexe_pattern, all_text)
date_naissance_match = re.search(date_naissance_pattern, all_text)

# Extracting and cleaning up the results
date_rcp = date_rcp_match.group(1).replace(" ", "") if date_rcp_match else None
nom_responsable = ' '.join(nom_responsable_match.group(1).split()) if nom_responsable_match else None
sexe = sexe_match.group(1) if sexe_match else None
date_naissance = date_naissance_match.group(1).replace(" ", "") if date_naissance_match else None

print(f"Date RCP: {date_rcp}")
print(f"Nom du responsable: {nom_responsable}")
print(f"Sexe: {sexe}")
print(f"Date de naissance: {date_naissance}")

Date RCP: 24/05/1950
Nom du responsable: None
Sexe: M
Date de naissance: 02-08-1982
02


In [43]:
print(all_text)

F I C H E
R C P
C H C
D E
.
CHU
de
paris
A
compléter
et
à
envoyer
à
avec
si
possible
un
bilan
biologique
récent
(de
moins
de
4
semaines),
et
une
imagerie
de
moins
de
6
semaines
Présentation
en
RCP
le
mardi
suivant
l’envoi
de
la
fiche
D a t e
d e
l a
R C P
:
2 4 / 0 5 / 1 9 5 0 … …
N o m
d u
r e s p o n s a b l e
d e
l a
R C P
:
… J e a n
P i e r r e … … … … .
M e m b r e s
d e
l a
R C P
( c o c h e r
l a
c a s e
s i
l e
m e m b r e
e s t
p r é s e n t )
H é p a t o
g a s t r o e n t é r o l o g u e ( s )
: 
R a d i o t h é r a p e u t e ( s )
:
☑
P i e r r e … … … … … …
❑
… … … … … … … … … … … … …
☑
T a l i s s a … … … … … … … …
❑
… … … … … … … … … … … … …
O n c o l o g u e ( s )
: 
A n a t o m o p a t h o l o g i s t e ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … …
R a d i o l o g u e ( s )
i n t e r v e n t i o n n e l ( s )
: 
R a d i o l o g u e ( s )
d i a g n o s t i q u e ( s )
:
❑
… … … … … … … … … … … … …

In [44]:
# All pattern
date_rcp_pattern = r"D\s*a\s*t\s*e\s*d\s*e\s*l\s*a\s*R\s*C\s*P\s*:\s*([\d\s]+/\s*[\d\s]+/\s*[\d\s]+)"
nom_responsable_pattern = r"N\s*o\s*m\s*d\s*u\s*r\s*e\s*s\s*p\s*o\s*n\s*s\s*a\s*b\s*l\s*e\s*d\s*e\s*l\s*a\s*R\s*C\s*P\s*:\s*([A-Za-z\s\.]+)\.\s*\.\s*\."
sexe_pattern = r"S\s*e\s*x\s*e\s*M\s*/\s*F\s*/\s*A\s*:\s*([MFA])"
date_naissance_pattern = r"D\s*a\s*t\s*e\s*d\s*e\s*n\s*a\s*i\s*s\s*s\s*a\s*n\s*c\s*e\s*:\s*([\d\s]+-[\d\s]+-[\d\s]+)"
# dossier_passe_RCP_pattern = r"Le patient doit être informé que son dossier va passer en RCP\s*:\s*(☑)"
# cirrhose_documentee_pattern = r"Cirrhose documentée\s*:\s*(☑ oui|☑ non)"
# gastroscopie_date_pattern = r"Gastroscopie\s*:\s*❑ oui, date de la dernière\s*:\s*([\d]{2}-[\d]{2}-[\d]{4})"
# gastroscopie_absent_pattern = r"Gastroscopie\s*:\s*(☑ non)"
# varices_present_pattern = r"Varices œsophagiennes/gastriques\s*:\s*❑ Présentes, grade des VO\s*:\s*(.+)"
# ascite_level_pattern = r"Ascite\s*:\s*(☑ Absente|☑ Modérée|☑ Abondante)"
# encephalopathy_diagnosis_date_pattern = r"Encéphalopathie\s*:\s*☑ oui\s*Si oui, date du diagnostic\s*:\s*([\d]{2}/[\d]{2}/[\d]{4})"
# alcohol_consumption_current_pattern = r"NON, consommation actuelle d'alcool\s*:\s*([\d\.]+)\s*g/jour"
# treatment_type_pattern = r"Type d’intervention\s*:\s*([\w\s]+)"
# number_of_cures_pattern = r"Nombre de cures\s*:\s*(\d+)"
# last_intervention_date_pattern = r"Date de la dernière intervention\s*:\s*([\d]{2}-[\d]{2}-[\d]{4})"
# location_of_intervention_pattern = r"Localisation\s*:\s*([\w\s]+)"
# size_of_tumor_pattern = r"Taille\s*:\s*([\d\.]+)\s*cm"
# additional_info_pattern = r"Information complémentaire\s*:\s*([\w\s]+)"
# bilirubine_total_pattern = r"Bilirubine totale\s*:\s*([\d\.]+)\s*µmol/L"
# creatinine_level_pattern = r"Créatinine\s*:\s*([\d\.]+)\s*µmol/L"
# albumin_level_pattern = r"Albumine\s*:\s*([\d\.]+)\s*g/L"
# platelets_count_pattern = r"Plaquettes\s*:\s*([\d\.]+)\s*G/L"
# prothrombin_time_pattern = r"TP\s*:\s*([\d\.]+)%"
# factor_v_level_pattern = r"Facteur V\s*:\s*([\d\.]+)%"
# alpha_foetoprotein_level_pattern = r"Alpha-foetoprotéine\s*:\s*([\d\.]+)\s*ng/mL"
# Define regex patterns using spaced and complex format for robust matching
dossier_passe_RCP_pattern = r"L\s*e\s* \s*p\s*a\s*t\s*i\s*e\s*n\s*t\s* \s*d\s*o\s*i\s*t\s* \s*ê\s*t\s*r\s*e\s* \s*i\s*n\s*f\s*o\s*r\s*m\s*é\s* \s*q\s*u\s*e\s* \s*s\s*o\s*n\s* \s*d\s*o\s*s\s*s\s*i\s*e\s*r\s* \s*v\s*a\s* \s*p\s*a\s*s\s*s\s*e\s*r\s* \s*e\s*n\s* \s*R\s*C\s*P\s*:\s*(☑)"
cirrhose_documentee_pattern = r"C\s*i\s*r\s*r\s*h\s*o\s*s\s*e\s* \s*d\s*o\s*c\s*u\s*m\s*e\s*n\s*t\s*é\s*e\s*:\s*(☑\s* \s*o\s*u\s*i\s*|\s*☑\s* \s*n\s*o\s*n)"
gastroscopie_date_pattern = r"G\s*a\s*s\s*t\s*r\s*o\s*s\s*c\s*o\s*p\s*i\s*e\s*:\s*❑\s* \s*o\s*u\s*i\s*,\s* \s*d\s*a\s*t\s*e\s* \s*d\s*e\s* \s*l\s*a\s* \s*d\s*e\s*r\s*n\s*i\s*è\s*r\s*e\s*:\s*([\d\s]+-\s*[\d\s]+-\s*[\d\s]+)"
gastroscopie_absent_pattern = r"G\s*a\s*s\s*t\s*r\s*o\s*s\s*c\s*o\s*p\s*i\s*e\s*:\s*(☑\s* \s*n\s*o\s*n)"
varices_present_pattern = r"V\s*a\s*r\s*i\s*c\s*e\s*s\s* \s*œ\s*s\s*o\s*p\s*h\s*a\s*g\s*i\s*e\s*n\s*n\s*e\s*s\s*/\s*g\s*a\s*s\s*t\s*r\s*i\s*q\s*u\s*e\s*s\s*:\s*❑\s* \s*P\s*r\s*é\s*s\s*e\s*n\s*t\s*e\s*s\s*,\s* \s*g\s*r\s*a\s*d\s*e\s* \s*d\s*e\s*s\s* \s*V\s*O\s*:\s*(.+)"
ascite_level_pattern = r"A\s*s\s*c\s*i\s*t\s*e\s*:\s*(☑\s* \s*A\s*b\s*s\s*e\s*n\s*t\s*e\s*|\s*☑\s* \s*M\s*o\s*d\s*é\s*r\s*é\s*e\s*|\s*☑\s* \s*A\s*b\s*o\s*n\s*d\s*a\s*n\s*t\s*e)"
encephalopathy_diagnosis_date_pattern = r"E\s*n\s*c\s*é\s*p\s*h\s*a\s*l\s*o\s*p\s*a\s*t\s*h\s*i\s*e\s*:\s*☑\s* \s*o\s*u\s*i\s*\s*S\s*i\s* \s*o\s*u\s*i\s*,\s* \s*d\s*a\s*t\s*e\s* \s*d\s*u\s* \s*d\s*i\s*a\s*g\s*n\s*o\s*s\s*t\s*i\s*c\s*:\s*([\d\s]+/\s*[\d\s]+/\s*[\d\s]+)"
alcohol_consumption_current_pattern = r"C\s*o\s*n\s*s\s*o\s*m\s*m\s*a\s*t\s*i\s*o\s*n\s* \s*a\s*l\s*c\s*o\s*o\s*l\s*:\s*☑\s* \s*o\s*u\s*i\s*,\s* \s*é\s*v\s*a\s*l\s*u\s*a\s*t\s*i\s*o\s*n\s* \s*\d*\s* \s*(j\s*o\s*u\s*r\s*s\s*/\s*s\s*e\s*m\s*a\s*i\s*n\s*e\s*s\s*/\s\s*o\s*u\s*i\s*n\s*e\s*e\s*n\s*n\s*e\s*s\s*)\s* \s*p\s*a\s*r\s* \s\s*s\s*e\s*m\s*a\s*i\s*n\s*e\s*s\s* \s*à\s* \s*2\s* \s*v\s*o\s*e\s*r\s*r\s*e\s*s\s* \s*p\s*a\s*r\s* \s\s*s\s*e\s*m\s*a\s*i\s*n\s*e\s*s\s*/\s*d\s*e\s*s\s*s\s*i\s*n\s*e\s*s\s*t\s*i\s*o\s*n\s*/\s*\d*\s* \s*(l\s*i\s*t\s*r\s*e\s*s\s* \s*p\s*a\s*r\s* \s\s*s\s*e\s*m\s*a\s*i\s*n\s*e\s*s\s*/\s*d\s*e\s*s\s*s\s*i\s*n\s*e\s*s\s*t\s*i\s*o\s*n\s*/\s*m\s*o\s*i\s*s\s*)\s*\."
alcohol_consumption_past_pattern = r"C\s*o\s*n\s*s\s*o\s*m\s*m\s*a\s*t\s*i\s*o\s*n\s* \s*a\s*l\s*c\s*o\s*o\s*l\s*:\s*☑\s* \s*n\s*o\s*n"
treatment_type_pattern = r"T\s*y\s*p\s*e\s* \s*d\s*'\s*i\s*n\s*t\s*e\s*r\s*v\s*e\s*n\s*t\s*i\s*o\s*n\s*:\s*([\w\s]+)"
number_of_cures_pattern = r"N\s*o\s*m\s*b\s*r\s*e\s* \s*d\s*e\s* \s*c\s*u\s*r\s*e\s*s\s*:\s*(\d+)"
last_intervention_date_pattern = r"D\s*a\s*t\s*e\s* \s*d\s*e\s* \s*l\s*a\s* \s*d\s*e\s*r\s*n\s*i\s*è\s*r\s*e\s* \s*i\s*n\s*t\s*e\s*r\s*v\s*e\s*n\s*t\s*i\s*o\s*n\s*:\s*([\d\s]+-\s*[\d\s]+-\s*[\d\s]+)"
location_of_intervention_pattern = r"L\s*o\s*c\s*a\s*l\s*i\s*s\s*a\s*t\s*i\s*o\s*n\s*:\s*([\w\s]+)"
size_of_tumor_pattern = r"T\s*a\s*i\s*l\s*l\s*e\s*:\s*([\d\s]*\.\s*[\d\s]+)\s* \s*c\s*m"
additional_info_pattern = r"I\s*n\s*f\s*o\s*r\s*m\s*a\s*t\s*i\s*o\s*n\s* \s*c\s*o\s*m\s*p\s*l\s*é\s*m\s*e\s*n\s*t\s*a\s*i\s*r\s*e\s*:\s*([\w\s]+)"
bilirubine_total_pattern = r"B\s*i\s*l\s*i\s*r\s*u\s*b\s*i\s*n\s*e\s* \s*t\s*o\s*t\s*a\s*l\s*e\s*:\s*([\d\s]*\.\s*[\d\s]+)\s* \s*µ\s*m\s*o\s*l\s*/\s*L"
creatinine_level_pattern = r"C\s*r\s*é\s*a\s*t\s*i\s*n\s*i\s*n\s*e\s*:\s*([\d\s]*\.\s*[\d\s]+)\s* \s*µ\s*m\s*o\s*l\s*/\s*L"
albumin_level_pattern = r"A\s*l\s*b\s*u\s*m\s*i\s*n\s*e\s*:\s*([\d\s]*\.\s*[\d\s]+)\s* \s*g\s*/\s*L"
platelets_count_pattern = r"P\s*l\s*a\s*q\s*u\s*e\s*t\s*t\s*e\s*s\s*:\s*([\d\s]*\.\s*[\d\s]+)\s* \s*G\s*/\s*L"
prothrombin_time_pattern = r"T\s*P\s*:\s*([\d\s]*\.\s*[\d\s]+)%"
factor_v_level_pattern = r"F\s*a\s*c\s*t\s*e\s*u\s*r\s* \s*V\s*:\s*([\d\s]*\.\s*[\d\s]+)%"
alpha_foetoprotein_level_pattern = r"A\s*l\s*p\s*h\s*a\s*-\s*f\s*o\s*e\s*t\s*o\s*p\s*r\s*o\s*t\s*e\s*i\s*n\s*e\s*:\s*([\d\s]*\.\s*[\d\s]+)\s* \s*n\s*g\s*/\s*m\s*L"



In [45]:
date_rcp_match = re.search(date_rcp_pattern, all_text)
nom_responsable_match = re.search(nom_responsable_pattern, all_text)
sexe_match = re.search(sexe_pattern, all_text)
date_naissance_match = re.search(date_naissance_pattern, all_text)
dossier_passe_RCP_match = re.search(dossier_passe_RCP_pattern, all_text)
cirrhose_documentee_match = re.search(cirrhose_documentee_pattern, all_text)
gastroscopie_date_match = re.search(gastroscopie_date_pattern, all_text)
gastroscopie_absent_match = re.search(gastroscopie_absent_pattern, all_text)
varices_present_match = re.search(varices_present_pattern, all_text)
ascite_level_match = re.search(ascite_level_pattern, all_text)
encephalopathy_diagnosis_date_match = re.search(encephalopathy_diagnosis_date_pattern, all_text)
alcohol_consumption_current_match = re.search(alcohol_consumption_current_pattern, all_text)
treatment_type_match = re.search(treatment_type_pattern, all_text)
number_of_cures_match = re.search(number_of_cures_pattern, all_text)
last_intervention_date_match = re.search(last_intervention_date_pattern, all_text)
location_of_intervention_match = re.search(location_of_intervention_pattern, all_text)
size_of_tumor_match = re.search(size_of_tumor_pattern, all_text)
additional_info_match = re.search(additional_info_pattern, all_text)
bilirubine_total_match = re.search(bilirubine_total_pattern, all_text)
creatinine_level_match = re.search(creatinine_level_pattern, all_text)
albumin_level_match = re.search(albumin_level_pattern, all_text)
platelets_count_match = re.search(platelets_count_pattern, all_text)
prothrombin_time_match = re.search(prothrombin_time_pattern, all_text)
factor_v_level_match = re.search(factor_v_level_pattern, all_text)
alpha_foetoprotein_level_match = re.search(alpha_foetoprotein_level_pattern, all_text)


In [35]:
# # Extraction et nettoyage des résultats
# date_rcp = date_rcp_match.group(1).replace(" ", "") if date_rcp_match else None
# nom_responsable = ' '.join(nom_responsable_match.group(1).split()) if nom_responsable_match else None
# sexe = sexe_match.group(1) if sexe_match else None
# date_naissance = date_naissance_match.group(1).replace(" ", "") if date_naissance_match else None

# dossier_passe_RCP = dossier_passe_RCP_match.group(1) == '☑' if dossier_passe_RCP_match else False
# cirrhose_documentee = cirrhose_documentee_match.group(1).strip() if cirrhose_documentee_match else None
# gastroscopie_date = gastroscopie_date_match.group(1).replace(" ", "") if gastroscopie_date_match else None
# gastroscopie_absent = gastroscopie_absent_match.group(1) == '☑' if gastroscopie_absent_match else False
# varices_present = varices_present_match.group(1).strip() if varices_present_match else None
# ascite_level = ascite_level_match.group(1).strip() if ascite_level_match else None
# encephalopathy_diagnosis_date = encephalopathy_diagnosis_date_match.group(1).replace(" ", "") if encephalopathy_diagnosis_date_match else None
# alcohol_consumption_current = float(alcohol_consumption_current_match.group(1).replace(" ", "")) if alcohol_consumption_current_match else None
# treatment_type = treatment_type_match.group(1).strip() if treatment_type_match else None
# number_of_cures = int(number_of_cures_match.group(1)) if number_of_cures_match else None
# last_intervention_date = last_intervention_date_match.group(1).replace(" ", "") if last_intervention_date_match else None
# location_of_intervention = location_of_intervention_match.group(1).strip() if location_of_intervention_match else None
# size_of_tumor = float(size_of_tumor_match.group(1).replace(" ", "")) if size_of_tumor_match else None
# additional_info = additional_info_match.group(1).strip() if additional_info_match else None
# bilirubine_total = float(bilirubine_total_match.group(1).replace(" ", "")) if bilirubine_total_match else None
# creatinine_level = float(creatinine_level_match.group(1).replace(" ", "")) if creatinine_level_match else None
# albumin_level = float(albumin_level_match.group(1).replace(" ", "")) if albumin_level_match else None
# platelets_count = float(platelets_count_match.group(1).replace(" ", "")) if platelets_count_match else None
# prothrombin_time = float(prothrombin_time_match.group(1).replace(" ", "")) if prothrombin_time_match else None
# factor_v_level = float(factor_v_level_match.group(1).replace(" ", "")) if factor_v_level_match else None
# alpha_foetoprotein_level = float(alpha_foetoprotein_level_match.group(1).replace(" ", "")) if alpha_foetoprotein_level_match else None


In [47]:
print("Date RCP:", date_rcp_match.group(1).replace(" ", "")) if date_rcp_match else None
print("Nom responsable:", ' '.join(nom_responsable_match.group(1).split())) if nom_responsable_match else None
print("Sexe:", sexe_match.group(1)) if sexe_match else None
print("Date de naissance:", date_naissance_match.group(1).replace(" ", "")) if date_naissance_match else None
print("Dossier passé en RCP:", dossier_passe_RCP_match.group(1)) if dossier_passe_RCP_match else None
print("Cirrhose documentée:", cirrhose_documentee_match.group(1).strip()) if cirrhose_documentee_match else None
print("Date de gastroscopie:", gastroscopie_date_match.group(1).replace(" ", "")) if gastroscopie_date_match else None
print("Gastroscopie absente:", gastroscopie_absent_match.group(1)) if gastroscopie_absent_match else None
print("Varices présentes:", varices_present_match.group(1).strip()) if varices_present_match else None
print("Niveau d'ascite:", ascite_level_match.group(1).strip()) if ascite_level_match else None
print("Date de diagnostic de l'encéphalopathie:", encephalopathy_diagnosis_date_match.group(1).replace(" ", "")) if encephalopathy_diagnosis_date_match else None
print("Consommation actuelle d'alcool:", alcohol_consumption_current_match.group(1).replace(" ", "")) if alcohol_consumption_current_match else None
print("Type de traitement:", treatment_type_match.group(1).strip()) if treatment_type_match else None
print("Nombre de cures:", number_of_cures_match.group(1)) if number_of_cures_match else None
print("Date de la dernière intervention:", last_intervention_date_match.group(1).replace(" ", "")) if last_intervention_date_match else None
print("Localisation de l'intervention:", location_of_intervention_match.group(1)) if location_of_intervention_match else None
print("Taille de la tumeur:", size_of_tumor_match.group(1).strip()) if size_of_tumor_match else None
print("Information complémentaire:", additional_info_match.group(1).strip()) if additional_info_match else None
print("Bilirubine totale:", bilirubine_total_match.group(1).strip()) if bilirubine_total_match else None
print("Créatinine:", creatinine_level_match.group(1).strip()) if creatinine_level_match else None
print("Albumine:", albumin_level_match.group(1).strip()) if albumin_level_match else None
print("Plaquettes:", platelets_count_match.group(1).strip()) if platelets_count_match else None
print("Temps de prothrombine:", prothrombin_time_match.group(1).strip()) if prothrombin_time_match else None
print("Facteur V:", factor_v_level_match.group(1).strip()) if factor_v_level_match else None
print("Niveau d'alpha-foetoprotéine:", alpha_foetoprotein_level_match.group(1).strip()) if alpha_foetoprotein_level_match else None


Date RCP: 24/05/1950
Sexe: M
Date de naissance: 02-08-1982
02
Localisation de l'intervention: P a r i s
T a i l l e

Niveau d'alpha-foetoprotéine: None
