In [8]:
import fitz  # PyMuPDF
import cv2
import numpy as np
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_to_grayscale(image):
    if len(image.shape) == 2 or image.shape[2] == 1:
        return image
    elif image.shape[2] == 3:
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    elif image.shape[2] == 4:
        return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)

def count_pixels_below_value(image, value):
    return np.sum(image < value)

def is_checkbox_marked(checkbox_region, threshold=0.5, dark_pixel_value=128):
    gray_region = convert_to_grayscale(checkbox_region)
    total_pixels = gray_region.size
    dark_pixels_count = count_pixels_below_value(gray_region, dark_pixel_value)
    dark_pixels_percentage = dark_pixels_count / total_pixels
    return dark_pixels_percentage >= threshold

def process_image(image, page):
    gray = convert_to_grayscale(image)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Define minimum and maximum dimensions for a checkbox
    min_dim = 8  # Minimum dimension for a checkbox to be considered
    max_dim = 15  # Maximum dimension for a checkbox to be considered
    min_aspect_ratio = 0.8  # Minimum aspect ratio to consider as checkbox
    max_aspect_ratio = 1.2  # Maximum aspect ratio to consider as checkbox

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Filter out non-checkbox contours by dimension and aspect ratio
        if w < min_dim or h < min_dim or w > max_dim or h > max_dim:
            continue
        if not min_aspect_ratio <= (w / h) <= max_aspect_ratio:
            continue
        checkbox_region = gray[y:y+h, x:x+w]
        marked = is_checkbox_marked(checkbox_region)
        text_x, text_y = x + w, y + h // 2  # Adjust as needed for text placement
        if marked:
            # Insert a checked box Unicode character
            page.insert_text((text_x, text_y), "box", fontsize=11, color=(0, 0, 1))
        else:
            # Insert an unchecked box Unicode character
            page.insert_text((text_x, text_y), "no box", fontsize=11, color=(1, 0, 0))

pdf_path = 'test_0.pdf'
pdf_document = fitz.open(pdf_path)

for page_number in range(len(pdf_document)):
    page = pdf_document.load_page(page_number)
    zoom = 2  # A zoom factor of 2 improves the quality of the image
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat)
    image_np = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
    image_np = cv2.cvtColor(image_np, cv2.COLOR_RGBA2BGR if pix.n == 4 else cv2.COLOR_RGB2BGR)
    process_image(image_np, page)

pdf_document.save("annotated_output.pdf")
pdf_document.close()


In [None]:
[
  {
    "titre": {
      "nom_etablissement": "string"
    },
    "date": {
      "date_rcp": "string",
      "nom_responsable": "string",
      "membres": {
        "hepatogastroenterologues": [],
        "oncologues": [],
        "radiologues_interventionnels": [],
        "chirurgiens": [],
        "radiotherapeutes": [],
        "anatomopathologiste": [],
        "radiologue_diagnostique": [],
        "infirmiers": []
      },
      "patient": {
        "sexe": "0,1,2",
        "date_naissance": "string",
        "coordonnees_medecin_traitant": "string",
        "coordonnees_medecin_adresseur": "string",
        "dossier_passe_RCP": "bool"
      },
      "motif": {
        "1ere_presentation": "bool",
        "dossier_deja_discute_le": "string",
        "motif_presentation": {
          "decision_traitement": "bool",
          "avis_diagnostique": "bool",
          "ajustement_therapeutique": "bool",
          "surveillance_post_traitement": "bool"
        }
      },
      "atcd": {
        "diabete_cardiovasculaire_comorbidite": "string",
        "traitement_courant": "string"
      }
    },
    "Donnees_Biologique_moins_de_4_semaines": {
      "Bilirubine_totale(µmol/L)": "float",
      "Creatinine(µmol/L)": "float",
      "Albumine(g/L)": "float",
      "Plaquettes(G/L)": "float",
      "TP": {
        "pourcentage": "float",
        "AVK_ou_AOD": "bool"
      },
      "Facteur_V(%)": "float",
      "Alpha-foetoprotéine(ng/mL)": "float"
    },
    "Souhait_prise_en_charge_post_rcp": {
      "accord": "bool",
      "souhait_patient": "str",
      "eligibilite": "bool"
    },
    "Information_patient_post_RCP": {
      "accord_pour_prévenir_prise_en_charge": "bool",
      "comment_informer": "bool"
    },
    "Decision_prise_en_charge": {
      "Transplantation": "bool",
      "Radio_embolisation": "bool",
      "Exerese_Chirurgie": {
        "value": "bool",
        "précision": "str"
      },
      "Chimio_Embolisation": "bool",
      "Traitement_systémique": "bool",
      "Essaie_thérapeutique": "bool",
      "Radiothérapie": "bool",
      "Soins_de_confort": "bool",
      "Precision/hiearchisation_si_propositions": "string"
    }
  "Cirrhose documentee": False,
    "arguments": {
        "clinique": False,
        "marqueurs non-invasifs": False,
        "imagerie": False,
        "biopsie": False
    }
}gastroscopie = {
    "oui, date de la dernière": str,
    "non": False,
    "Varices_oesophagiennes_gastriques": {
        "Non recherchées": False,
        "Absentes": False,
        "Présentes, grade des VO:": str,
    }
}ascite = {
    "Absente": bool,
    "Modérée": bool,
    "Abondante": bool,
    "Pas d'ascite clinique mais visible en imagerie": bool,
    "Pas d'ascite actuellement mais ATCD d'ascite": bool
}encephalopathie = {
    "oui": bool,
    "non": bool,
    "Si oui, date du diagnostic": str
}alcool = {
    "OUI, depuis combien de temps": str,
    "NON, consommation actuelle d'alcool": float
}histoire_de_la_maladie = {
    "date_du_diagnostic": str,
    "Prouvé histologiquement": bool,
    "traitements_locoregionaux": {
        "intervention_1": {
            "Type d'intervention": str,
            "Nombre de cures": int,
            "Date de la dernière intervention": str,
            "Localisation": str,
            "Taille": float,
            "Information complémentaire": str
        },
        "intervention_2": {
            "Type d'intervention": str,
            "Nombre de cures": int,
            "Date de la dernière intervention": str,
            "Localisation": str,
            "Taille": float,
            "Information complémentaire": str
        },
        "intervention_3": {
            "Type d'intervention": str,
            "Nombre de cures": int,
            "Date de la dernière intervention": str,
            "Localisation": str,
            "Taille": float,
            "Information complémentaire": str
        }
    },
    "traitements_systémiques": [
        {
            "Nombre de lignes": int,
            "traitements": [
                {
                    "type": str,
                    "Date de début": str,
                    "Date de fin": str,
                    "Motif arrêt": str
                },
                {
                    "type": str,
                    "Date de début": str,
                    "Date de fin": str,
                    "Motif arrêt": str
                },
                {
                    "type": str,
                    "Date de début": str,
                    "Date de fin": str,
                    "Motif arrêt": str
                }
            ],
            "Soins de confort": bool,
            "Si oui, préciser": str
        }
    ]
}imc = {
    "IMC": float,
    "ECOG": float
  }
  
]


In [12]:
#### Test OCR and Checkbox 2 

import cv2
import pytesseract
import fitz  # PyMuPDF

# Configure Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Define a function to process a single page
def process_pdf_page(page):
    # Convert the PDF page to a PIL image
    pix = page.get_pixmap()
    
    # Convert the Pixmap to a NumPy array
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

    # Convert from RGB (PyMuPDF's default) to BGR for OpenCV
    img_cv = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    
    # Convert to grayscale
    gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
    
    # Threshold the image to get binary image
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    
    # Find contours (this should include checkboxes)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    # Loop through contours to find checkboxes and extract text
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # Determine the region where the checkbox and text should be
        checkbox_roi = thresh[y:y+h, x:x+w]
        
        # Check if the checkbox is marked based on pixel intensity
        if cv2.countNonZero(checkbox_roi) > (w * h * 0.5):
            is_checked = True
        else:
            is_checked = False
        
        # Extract the text region next to the checkbox
        text_roi = gray[y:y+h, x+w:x+w*4]  # Adjust the width multiplier as needed
        text = pytesseract.image_to_string(text_roi, config='--psm 6')
        
        # Print the status of checkbox and the extracted text
        print(f"Checkbox checked: {is_checked}, Text: {text}")

# Open the PDF file with PyMuPDF
pdf_document = fitz.open('test_0.pdf')

# Process each page
for page_num in range(len(pdf_document)):
    page = pdf_document[page_num]
    process_pdf_page(page)

pdf_document.close()


Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text: 
Checkbox checked: False, Text: 
Checkbox checked: False, Text: 
Checkbox checked: True, Text: 
Checkbox checked: False, Text

KeyboardInterrupt: 

In [6]:
import PyPDF2

# Open the PDF file
with open("./test_0.pdf", "rb") as file:
    reader = PyPDF2.PdfReader(file)
    
    # Get all form fields in the PDF
    fields = reader.get_fields()
    print(fields)
    # Optionally, print all fields
    for name, field in fields:
        print(f"Field Name: {name}")
        print(f"Field Details: {field}")


None


TypeError: 'NoneType' object is not iterable

In [12]:
with open("test_0.pdf", "rb") as file:
    reader = PyPDF2.PdfReader(file)
    fields = reader.get_fields()

    if fields is not None:
        for name, field in fields.items():
            if field.get('/FT') == '/Btn':  # Checkboxes have the field type '/Btn'
                if field.get('/V') == '/Yes':  # The value '/Yes' often indicates a checked checkbox
                    print(f"Checkbox {name} is checked.")
                else:
                    print(f"Checkbox {name} is not checked.")
            else:
                print(f"Field Name: {name}")
                print(f"Field Details: {field}")
    else:
        print("No fields found or get_fields() returned None.")


No fields found or get_fields() returned None.


In [16]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal

for page_layout in extract_pages("test_0.pdf"):
    for element in page_layout:
        if isinstance(element, LTTextBoxHorizontal):
            print(element.get_text())


ModuleNotFoundError: No module named 'pdfminer.high_level'

In [21]:
import fitz  # PyMuPDF

print("Opening the PDF file...")
pdf_document = fitz.open("test_0.pdf")

print(f"The document has {len(pdf_document)} pages.")
# Iterate through each page
for current_page in range(len(pdf_document)):
    print(f"Processing page {current_page}...")
    page = pdf_document[current_page]
    
    # Access the form's widget annotations (where form data is stored)
    widgets = page.widgets()
    if widgets:  # Check if there are any widgets on the current page
        print(f"Found {(widgets)} widgets on page {current_page}.")
        for widget in widgets:
            info = widget.field_info
            if widget.field_type == fitz.PDF_WIDGET_TYPE_CHECKBOX:
                print(f"Checkbox found: {info}")
            else:
                print(f"Form field found: {info}")
    else:
        print(f"No widgets found on page {current_page}.")

# Close the PDF document
pdf_document.close()
print("Closed the PDF file.")


Opening the PDF file...
The document has 3 pages.
Processing page 0...
Found <generator object Page.widgets at 0x000002366A18E890> widgets on page 0.
Processing page 1...
Found <generator object Page.widgets at 0x000002366A246DD0> widgets on page 1.
Processing page 2...
Found <generator object Page.widgets at 0x000002366A18E890> widgets on page 2.
Closed the PDF file.


In [20]:
!pip install --upgrade pymupdf



In [None]:
counter_total_line = 0 
counter_barz = 0 
distance_stop_hauteur, distance_stop_longueur = 0 , 0 



In [23]:
reader = PyPDF2.PdfReader("test_0.pdf")
cnt = len(reader.pages)
print("reading pdf (%d pages)" % cnt)
page = reader.pages[cnt-1]
lines = page.extract_text().splitlines()
print("%d lines extracted..." % len(lines))
reader.get_fields() # returns None
reader.get_form_text_fields() # returns None

reading pdf (3 pages)
61 lines extracted...


{}

In [29]:
# Import PDF reader library
from PyPDF2 import PdfReader# Open the PDF file
with open("test.pdf", "rb") as file:
    # Create a PDF file reader
    reader = PdfReader(file)
    # Get the number of pages
    num_pages = len(reader.pages)
    # Print the number of pages
    print(num_pages)
    # Get the first page
    page = reader.pages[0]
    # Extract the text from the first page
    text = page.extract_text()
    # Print the text
    print(text)

5
F I C H E
R C P
C H C
D E
.
CHU
de
paris
A
compléter
et
à
envoyer
à
avec
si
possible
un
bilan
biologique
récent
(de
moins
de
4
semaines),
et
une
imagerie
de
moins
de
6
semaines
Présentation
en
RCP
le
mardi
suivant
l’envoi
de
la
fiche
D a t e
d e
l a
R C P
:
… … … … … … . . … … …
N o m
d u
r e s p o n s a b l e
d e
l a
R C P
:
… … … … … … … … … … … … … … … … … … … … .
M e m b r e s
d e
l a
R C P
( c o c h e r
l a
c a s e
s i
l e
m e m b r e
e s t
p r é s e n t )
H é p a t o
g a s t r o e n t é r o l o g u e ( s )
:
☑
… … … … … … … … … … … … …
☑
… … … … … … … … … … … … … 
O n c o l o g u e ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … … 
R a d i o l o g u e ( s )
i n t e r v e n t i o n n e l ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … … 
C h i r u r g i e n ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … …
R a d i o t h é r a p e u t e ( s )
:
❑
… … … … … … … … … … … … …
❑
… … … … … … … … … … … … … 
A n a t o m o p a t h o l o g i s t e 