# PDF Scraping

Extract all relevant data from the IOL Master PDFs. 

The process is:
1. Load all PDFs by version
2. Create image of PDF
3. Use OCR in Pytesseract to convert to text
4. Use defined positions to extract the desired data points

In [1]:
# import libraries
from pdf2image import convert_from_path
import pandas as pd
import pytesseract
from pathlib import Path
import os

## Functions

### List PDFs and Extract Data Points

In [3]:
def list_pdf(pdf_path):
    """Return the paths of all PDFs below ``pdf_path``, oldest first.

    The directory is searched recursively for ``*.pdf`` files. The result is
    a list of path strings ordered by ascending file modification time.
    """
    # recursive search for everything with a .pdf extension
    found = Path(pdf_path).rglob("*.pdf")

    # order by modification time (oldest first), then hand back plain strings
    oldest_first = sorted(found, key=os.path.getmtime)
    return [str(path) for path in oldest_first]


def extract_data_point(df, coord_left_start, coord_left_end, coord_top_start, coord_top_end, further_conditions=None, sentence=True):
    """Read the OCR text found inside one rectangular box of a page.

    Args:
        df: OCR result table with at least the columns ``left``, ``top``,
            ``conf`` and ``text`` (one row per detected element).
        coord_left_start, coord_left_end: inclusive range for ``left``.
        coord_top_start, coord_top_end: inclusive range for ``top``.
        further_conditions: accepted for compatibility; currently unused.
        sentence: if truthy, return all texts in the box joined by a single
            space; otherwise return the first row of the box as a dict.

    Returns:
        ``None`` when no row with ``conf > -1`` lies inside the box.
        Otherwise a string (``sentence`` truthy) or a dict holding every
        column of the first row in reading order. ``top`` in that dict may
        have been snapped to its line's reference value, see below.
    """
    # keep rows inside the box whose confidence marks them as likely text
    in_box = (
        df["left"].between(coord_left_start, coord_left_end)
        & df["top"].between(coord_top_start, coord_top_end)
        & (df["conf"] > -1)  # Adjust conf condition as needed
    )
    # explicit copy: "top" is rewritten below, and assigning into the result of
    # a boolean selection would otherwise raise SettingWithCopyWarning
    filtered_df = df.loc[in_box].copy()

    if filtered_df.empty:
        return None

    # Elements of one printed line can differ by a pixel or two in "top".
    # Snap everything within the tolerance of the first and of the last
    # non-empty element onto that element's "top", so that sorting by
    # (top, left) keeps each of these two lines in left-to-right order.
    height_tolerance = 2
    with_text = filtered_df[filtered_df["text"] != ""]
    if not with_text.empty:
        pos_first_text = with_text.iloc[0]["top"]
        pos_last_text = with_text.iloc[-1]["top"]
        # first line is adjusted before the last line
        for anchor in (pos_first_text, pos_last_text):
            same_line = filtered_df["top"].between(anchor - height_tolerance, anchor + height_tolerance)
            filtered_df.loc[same_line, "top"] = anchor

    # reading order: top to bottom, then left to right
    filtered_df = filtered_df.sort_values(["top", "left"])

    if sentence:
        return " ".join(filtered_df["text"])
    # single data point: the first row in reading order, all columns
    return filtered_df.iloc[0].to_dict()
        



def extract_data_point_backup(df, coor_left_start, coord_left_end, coord_top_start, coord_top_end, further_conditions=None, sentence=True):
    """Earlier variant of the box lookup, without line-height tolerance.

    Selects the ``text`` of all rows whose ``left``/``top`` lie inside the
    given inclusive ranges and whose ``conf`` is above -1, in (top, left)
    order. Returns ``None`` if nothing matches, the texts joined by a space
    if ``sentence == True``, and otherwise the first text.
    ``further_conditions`` is accepted but not used.
    """
    # rows inside the box that are likely text (conf > -1)
    inside_box = (
        df.left.between(coor_left_start, coord_left_end)
        & df.top.between(coord_top_start, coord_top_end)
        & (df.conf > -1)
    )

    # reading order first, then pick the text of the matching rows
    ordered = df.sort_values(["top", "left"])
    words = ordered.loc[inside_box, "text"]

    # no entry found
    if words.empty:
        return None

    # whole sentence requested (strict comparison kept as in the original)
    if sentence == True:
        return words.str.cat(sep=" ")

    # single data point (first found, if many)
    return words.values[0]

### Extract IOL Data From PDF

#### v1.5

In [None]:
def extract_iol_v15(file):
    """Extract all data points of one v1.5 report PDF into a one-row DataFrame.

    The first page of ``file`` is rendered to an image, run through
    pytesseract, and every field is read from a fixed pixel box with
    ``extract_data_point``. Fields whose box contains no text are ``None``.

    Args:
        file: path of the PDF; stored unchanged in the ``filename`` column.

    Returns:
        DataFrame with index ``[0]`` and the columns ``filename``, the four
        header fields, then the ``*_R`` and ``*_L`` measurement columns.
    """
    # define path of tesseract
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # only the first page is read, so only that page is rendered
    pages = convert_from_path(file, first_page=1, last_page=1)

    # OCR the page image; one row per detected element with position and confidence
    d = pytesseract.image_to_data(pages[0], output_type=pytesseract.Output.DICT)
    d_df = pd.DataFrame.from_dict(d)

    # output column -> (left_start, left_end, top_start, top_end) of its box.
    # The dict order is the column order of the returned frame.
    boxes = {
        "patnr": (280, 320, 180, 220),        # patient number
        "exam_date": (370, 410, 430, 470),    # examination date
        "comment": (150, 1400, 520, 720),     # comment
        # lens category (monofokal vs ...)
        # NOTE(review): top range 250-770 is far taller than the 250-300 box
        # that iol_lenses_v15 uses for the same field - confirm it is intended.
        "lens_cat": (290, 320, 250, 770),

        # IOL RIGHT
        "AL_R": (220, 250, 880, 910),
        "VKT_R": (220, 250, 910, 940),
        "LD_R": (220, 250, 940, 965),
        "R_R": (220, 250, 965, 990),
        # SD of AL, VKT, LD
        "AL_SD_R": (460, 480, 880, 910),
        "VKT_SD_R": (460, 480, 910, 940),
        "LD_SD_R": (460, 480, 940, 965),
        # R1, R2, DeltaD
        "R1_R": (220, 260, 1000, 1020),
        "R2_R": (220, 260, 1020, 1050),
        "DeltaD_R": (220, 260, 1050, 1080),
        # @ of R1, R2, DeltaD
        "R1_at_R": (390, 410, 1000, 1020),
        "R2_at_R": (390, 410, 1020, 1050),
        "DeltaD_at_R": (390, 410, 1050, 1080),
        # WZW
        "WZW_R": (640, 660, 1055, 1080),
        # Refraction aim
        "RA_aim_R": (290, 320, 1125, 1145),

        # IOL LEFT
        "AL_L": (930, 960, 880, 910),
        "VKT_L": (930, 960, 910, 940),
        "LD_L": (930, 960, 940, 965),
        "R_L": (930, 960, 965, 990),
        # SD of AL, VKT, LD
        "AL_SD_L": (1170, 1190, 880, 910),
        "VKT_SD_L": (1170, 1190, 910, 940),
        "LD_SD_L": (1170, 1190, 940, 965),
        # R1, R2, DeltaD
        "R1_L": (930, 960, 1000, 1020),
        "R2_L": (930, 960, 1020, 1050),
        "DeltaD_L": (930, 960, 1050, 1080),
        # @ of R1, R2, DeltaD
        "R1_at_L": (1100, 1120, 1000, 1020),
        "R2_at_L": (1100, 1120, 1020, 1050),
        "DeltaD_at_L": (1100, 1120, 1050, 1080),
        # WZW
        "WZW_L": (1350, 1370, 1055, 1080),
        # Refraction aim
        "RA_aim_L": (1010, 1030, 1125, 1145),
    }

    # read every box; all fields are read as space-joined text
    values = {"filename": file}
    for column, box in boxes.items():
        values[column] = extract_data_point(d_df, *box)

    # combine to a one-row df
    return pd.DataFrame(values, index=[0])


#### v1.7

In [None]:
def extract_iol_v17(file):
    """Extract all data points of one v1.7 report PDF into a one-row DataFrame.

    The first page of ``file`` is rendered to an image, run through
    pytesseract, and every field is read from a fixed pixel box with
    ``extract_data_point``. Left-eye boxes are the right-eye boxes shifted
    712 px to the right. Fields whose box contains no text are ``None``.

    Args:
        file: path of the PDF; stored unchanged in the ``filename`` column.

    Returns:
        DataFrame with index ``[0]`` and the columns ``filename``, the four
        header fields, then the ``*_R`` and ``*_L`` measurement columns.
    """
    # define path of tesseract
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # only the first page is read, so only that page is rendered
    pages = convert_from_path(file, first_page=1, last_page=1)

    # OCR the page image; one row per detected element with position and confidence
    d = pytesseract.image_to_data(pages[0], output_type=pytesseract.Output.DICT)
    d_df = pd.DataFrame.from_dict(d)

    # horizontal distance between the right-eye and the left-eye column
    left_eye_offset = 712

    # header field -> (left_start, left_end, top_start, top_end)
    header_boxes = {
        "patnr": (360, 380, 220, 240),        # patient number
        "exam_date": (370, 410, 430, 470),    # examination date
        "comment": (150, 1400, 520, 720),     # comment
        "lens_cat": (360, 380, 280, 310),     # lens category (monofokal vs ...)
    }

    # per-eye field -> box for the RIGHT eye, listed in output column order
    eye_boxes = {
        "AL": (215, 250, 870, 895),
        "VKT": (215, 250, 900, 920),
        "LD": (215, 250, 930, 950),
        "R": (215, 250, 955, 980),
        # SD of AL, VKT, LD
        "AL_SD": (415, 430, 870, 890),
        "VKT_SD": (370, 380, 900, 920),
        "LD_SD": (415, 440, 930, 950),
        # NOTE(review): R1's top range runs to 1100 while R2 on the same row
        # block uses 985-1010 - confirm 1100 is intended.
        "R1": (215, 250, 985, 1100),
        "R2": (635, 660, 985, 1010),
        "DeltaD": (635, 660, 955, 975),
        # @ of R1, R2, DeltaD
        # NOTE(review): the R1_at box (960-970) lies inside the DeltaD_at box
        # (960-975) at the same left range - confirm it really targets R1.
        "R1_at": (780, 800, 960, 970),
        "R2_at": (780, 800, 985, 1010),
        "DeltaD_at": (780, 800, 960, 975),
        "WZW": (650, 665, 870, 890),
        # Refraction aim
        "RA_aim": (250, 280, 1050, 1080),
    }

    values = {"filename": file}
    for column, box in header_boxes.items():
        values[column] = extract_data_point(d_df, *box)

    # right eye first, then the same boxes shifted for the left eye
    for side, shift in (("R", 0), ("L", left_eye_offset)):
        for name, (left_start, left_end, top_start, top_end) in eye_boxes.items():
            values[f"{name}_{side}"] = extract_data_point(
                d_df, left_start + shift, left_end + shift, top_start, top_end
            )

    # combine to a one-row df
    return pd.DataFrame(values, index=[0])

#### v1.9

In [4]:
def extract_iol_v19(file):
    """Extract all data points of one v1.9 report PDF into a one-row DataFrame.

    The first page of ``file`` is rendered to an image, run through
    pytesseract, and every field is read from a fixed pixel box with
    ``extract_data_point``. Left-eye boxes are the right-eye boxes shifted
    710 px to the right. Fields whose box contains no text are ``None``.

    Args:
        file: path of the PDF; stored unchanged in the ``filename`` column.

    Returns:
        DataFrame with index ``[0]`` and the columns ``filename``, the four
        header fields, then the ``*_R`` and ``*_L`` measurement columns.
    """
    # define path of tesseract
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # only the first page is read, so only that page is rendered
    pages = convert_from_path(file, first_page=1, last_page=1)

    # OCR the page image; one row per detected element with position and confidence
    d = pytesseract.image_to_data(pages[0], output_type=pytesseract.Output.DICT)
    d_df = pd.DataFrame.from_dict(d)

    # horizontal distance between the right-eye and the left-eye column
    left_eye_offset = 710

    # header field -> (left_start, left_end, top_start, top_end)
    header_boxes = {
        "patnr": (360, 380, 220, 240),        # patient number
        "exam_date": (370, 410, 430, 470),    # examination date
        "comment": (175, 1400, 465, 600),     # comment
        "lens_cat": (360, 380, 280, 310),     # lens category (monofokal vs ...)
    }

    # per-eye field -> box for the RIGHT eye, listed in output column order
    eye_boxes = {
        "AL": (215, 250, 970, 990),
        "VKT": (215, 250, 1000, 1020),
        "LD": (215, 250, 1030, 1050),
        "R": (215, 250, 1090, 1115),
        # SD of AL, VKT, LD
        # NOTE(review): an SD box for R at (470, 490, 1090, 1120) used to be
        # read here but was never exported; add an "R_SD" entry if that
        # column is wanted.
        "AL_SD": (530, 560, 970, 990),
        "VKT_SD": (530, 560, 1000, 1020),
        "LD_SD": (530, 560, 1030, 1050),
        "R1": (630, 650, 1090, 1115),
        "R2": (630, 650, 1125, 1150),
        # NOTE(review): earlier comments questioned whether this field is
        # DeltaK rather than DeltaD in this version - confirm.
        "DeltaD": (215, 250, 1125, 1150),
        # @ of R1, R2, DeltaD
        "R1_at": (750, 810, 1090, 1115),
        "R2_at": (750, 810, 1125, 1150),
        "DeltaD_at": (350, 390, 1125, 1155),
        "WZW": (215, 250, 1065, 1085),
        # Refraction aim
        "RA_aim": (275, 300, 895, 915),
    }

    values = {"filename": file}
    for column, box in header_boxes.items():
        values[column] = extract_data_point(d_df, *box)

    # right eye first, then the same boxes shifted for the left eye
    for side, shift in (("R", 0), ("L", left_eye_offset)):
        for name, (left_start, left_end, top_start, top_end) in eye_boxes.items():
            values[f"{name}_{side}"] = extract_data_point(
                d_df, left_start + shift, left_end + shift, top_start, top_end
            )

    # combine to a one-row df
    return pd.DataFrame(values, index=[0])

### Extract Lens Info From PDF

#### v1.9

In [102]:
def iol_lenses_v19(pdf_file):
    """Extract the lens suggestions from pages 1 and 2 of a v1.9 report PDF.

    Each page is laid out as two eyes (R, L), each with a 2 x 2 grid of lens
    blocks, each block listing 5 lens versions that differ in strength
    together with the predicted refraction. A page is only read if the text
    at its header position contains "rechnung"; reading stops at the first
    page without it.

    Args:
        pdf_file: path of the PDF; stored unchanged in the ``filename`` column.

    Returns:
        DataFrame with one row per (page, eye, lens block, version) - 40 rows
        per accepted page - and the columns ``filename``, ``patnr``,
        ``exam_date``, ``lens_cat``, ``RA_aim``, ``side``, ``lens``, ``v``
        (1-5), ``iol``, ``refr`` and ``lens_a_const``. Boxes without text
        yield ``None``. Every row carries index 0. An empty DataFrame is
        returned when no page was accepted.
    """
    # initiate pytesseract
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # only pages 1 and 2 are read, so only those are rendered
    pages = convert_from_path(pdf_file, first_page=1, last_page=2)

    # distances between bounding boxes
    block_width = 350
    block_height = 355
    LR_width = 709
    lens_version_line_height = 31

    # one single-row frame per lens version; concatenated once at the end
    frames = []

    # iterate through page 1 and 2
    for page in pages[:2]:

        # OCR the page image
        d = pytesseract.image_to_data(page, output_type=pytesseract.Output.DICT)
        d_df = pd.DataFrame.from_dict(d)

        # look for page header "IOL-Berechnung" -> stop if it does not exist
        header = extract_data_point(d_df, 700, 800, 630, 700)
        if not header or ("rechnung" not in header):
            break

        # general information to identify the IOL biometry
        patnr = extract_data_point(d_df, 360, 380, 220, 240)        # patient number
        exam_date = extract_data_point(d_df, 370, 410, 430, 470)    # examination date
        lens_cat = extract_data_point(d_df, 360, 380, 280, 310)     # lens category

        # right and left side (eye R and eye L)
        for LR, LR_str in enumerate(("R", "L")):
            x_eye = LR * LR_width

            # refraction aim
            RA_aim = extract_data_point(d_df, 275 + x_eye, 300 + x_eye, 895, 915)

            # lens blocks: top to bottom, then left to right
            for v_ver in range(2):
                y_block = v_ver * block_height

                for v_hor in range(2):
                    x_block = v_hor * block_width + x_eye

                    # lens name
                    lens = extract_data_point(
                        d_df, 170 + x_block, 490 + x_block, 1230 + y_block, 1280 + y_block
                    )

                    # lens a constants
                    lens_a_const = extract_data_point(
                        d_df, 150 + x_block, 480 + x_block, 1310 + y_block, 1340 + y_block
                    )

                    # the 5 lens versions (IOL in dpt) with their predicted refraction (Refr in dpt)
                    for v in range(5):
                        y_line = y_block + v * lens_version_line_height

                        # left column: IOL in dpt
                        iol = extract_data_point(
                            d_df, 215 + x_block, 325 + x_block, 1370 + y_line, 1400 + y_line
                        )

                        # right column: predicted refraction in dpt
                        refr = extract_data_point(
                            d_df, 378 + x_block, 455 + x_block, 1370 + y_line, 1400 + y_line
                        )

                        frames.append(
                            pd.DataFrame(
                                {
                                    "filename": pdf_file,
                                    "patnr": patnr,
                                    "exam_date": exam_date,
                                    "lens_cat": lens_cat,
                                    "RA_aim": RA_aim,
                                    "side": LR_str,
                                    "lens": lens,
                                    "v": v + 1,
                                    "iol": [iol],
                                    "refr": [refr],
                                    "lens_a_const": lens_a_const,
                                }
                            )
                        )

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

#### v1.7

In [95]:
def iol_lenses_v17(pdf_file):
    """Extract the lens suggestions from pages 1 and 2 of a v1.7 report PDF.

    Each page is laid out as two eyes (R, L), each with a 2 x 2 grid of lens
    blocks, each block listing 5 lens versions that differ in strength
    together with the predicted refraction. A page is only read if the text
    at its header position contains "rechnung"; reading stops at the first
    page without it.

    Args:
        pdf_file: path of the PDF; stored unchanged in the ``filename`` column.

    Returns:
        DataFrame with one row per (page, eye, lens block, version) - 40 rows
        per accepted page - and the columns ``filename``, ``patnr``,
        ``exam_date``, ``lens_cat``, ``RA_aim``, ``side``, ``lens``, ``v``
        (1-5), ``iol``, ``refr`` and ``lens_a_const``. Boxes without text
        yield ``None``. Every row carries index 0. An empty DataFrame is
        returned when no page was accepted.
    """
    # initiate pytesseract
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # only pages 1 and 2 are read, so only those are rendered
    pages = convert_from_path(pdf_file, first_page=1, last_page=2)

    # distances between bounding boxes
    block_width = 350
    block_height = 398
    LR_width = 709
    lens_version_line_height = 29

    # one single-row frame per lens version; concatenated once at the end
    frames = []

    # iterate through page 1 and 2
    for page in pages[:2]:

        # OCR the page image
        d = pytesseract.image_to_data(page, output_type=pytesseract.Output.DICT)
        d_df = pd.DataFrame.from_dict(d)

        # look for page header "IOL-Berechnung" -> stop if it does not exist
        header = extract_data_point(d_df, 700, 800, 710, 770)
        if not header or ("rechnung" not in header):
            break

        # general information to identify the IOL biometry
        patnr = extract_data_point(d_df, 360, 380, 220, 240)        # patient number
        exam_date = extract_data_point(d_df, 370, 410, 430, 470)    # examination date
        lens_cat = extract_data_point(d_df, 360, 380, 280, 310)     # lens category

        # right and left side (eye R and eye L)
        for LR, LR_str in enumerate(("R", "L")):
            x_eye = LR * LR_width

            # refraction aim
            RA_aim = extract_data_point(d_df, 250 + x_eye, 280 + x_eye, 1050, 1080)

            # lens blocks: top to bottom, then left to right
            for v_ver in range(2):
                y_block = v_ver * block_height

                for v_hor in range(2):
                    x_block = v_hor * block_width + x_eye

                    # lens name
                    lens = extract_data_point(
                        d_df, 170 + x_block, 490 + x_block, 1170 + y_block, 1260 + y_block
                    )

                    # lens a constants
                    lens_a_const = extract_data_point(
                        d_df, 150 + x_block, 480 + x_block, 1310 + y_block, 1340 + y_block
                    )

                    # the 5 lens versions (IOL in dpt) with their predicted refraction (Refr in dpt)
                    for v in range(5):
                        y_line = y_block + v * lens_version_line_height

                        # left column: IOL in dpt
                        iol = extract_data_point(
                            d_df, 175 + x_block, 215 + x_block, 1375 + y_line, 1405 + y_line
                        )

                        # right column: predicted refraction in dpt
                        refr = extract_data_point(
                            d_df, 370 + x_block, 440 + x_block, 1375 + y_line, 1405 + y_line
                        )

                        frames.append(
                            pd.DataFrame(
                                {
                                    "filename": pdf_file,
                                    "patnr": patnr,
                                    "exam_date": exam_date,
                                    "lens_cat": lens_cat,
                                    "RA_aim": RA_aim,
                                    "side": LR_str,
                                    "lens": lens,
                                    "v": v + 1,
                                    "iol": [iol],
                                    "refr": [refr],
                                    "lens_a_const": lens_a_const,
                                }
                            )
                        )

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

#### v1.5

In [126]:
def iol_lenses_v15(pdf_file):
    """Extract the lens suggestions from pages 1 and 2 of a v1.5 report PDF.

    Each page is laid out as two eyes (R, L), each with a 2 x 2 grid of lens
    blocks, each block listing 5 lens versions that differ in strength
    together with the predicted refraction. A page is only read if the text
    at its header position contains "rechnung"; reading stops at the first
    page without it.

    Args:
        pdf_file: path of the PDF; stored unchanged in the ``filename`` column.

    Returns:
        DataFrame with one row per (page, eye, lens block, version) - 40 rows
        per accepted page - and the columns ``filename``, ``patnr``,
        ``exam_date``, ``lens_cat``, ``RA_aim``, ``side``, ``lens``, ``v``
        (1-5), ``iol``, ``refr`` and ``lens_a_const``. Boxes without text
        yield ``None``. Every row carries index 0. An empty DataFrame is
        returned when no page was accepted.
    """
    # initiate pytesseract
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    # only pages 1 and 2 are read, so only those are rendered
    pages = convert_from_path(pdf_file, first_page=1, last_page=2)

    # distances between bounding boxes
    block_width = 350
    block_height = 354
    LR_width = 709
    lens_version_line_height = 29
    # vertical shift added to every lens-block box of this layout
    height_corr = 60

    # one single-row frame per lens version; concatenated once at the end
    frames = []

    # iterate through page 1 and 2
    for page in pages[:2]:

        # OCR the page image
        d = pytesseract.image_to_data(page, output_type=pytesseract.Output.DICT)
        d_df = pd.DataFrame.from_dict(d)

        # look for page header "IOL-Berechnung" -> stop if it does not exist
        header = extract_data_point(d_df, 700, 800, 720, 775)
        if not header or ("rechnung" not in header):
            break

        # general information to identify the IOL biometry
        patnr = extract_data_point(d_df, 280, 320, 180, 220)        # patient number
        exam_date = extract_data_point(d_df, 370, 410, 430, 470)    # examination date
        lens_cat = extract_data_point(d_df, 290, 420, 250, 300)     # lens category

        # right and left side (eye R and eye L)
        for LR, LR_str in enumerate(("R", "L")):
            x_eye = LR * LR_width

            # refraction aim
            RA_aim = extract_data_point(d_df, 290 + x_eye, 320 + x_eye, 1125, 1145)

            # lens blocks: top to bottom, then left to right
            for v_ver in range(2):
                y_block = v_ver * block_height + height_corr

                for v_hor in range(2):
                    x_block = v_hor * block_width + x_eye

                    # lens name
                    lens = extract_data_point(
                        d_df, 165 + x_block, 490 + x_block, 1170 + y_block, 1260 + y_block
                    )

                    # lens a constants
                    lens_a_const = extract_data_point(
                        d_df, 150 + x_block, 480 + x_block, 1310 + y_block, 1340 + y_block
                    )

                    # the 5 lens versions (IOL in dpt) with their predicted refraction (Refr in dpt)
                    for v in range(5):
                        y_line = y_block + v * lens_version_line_height

                        # left column: IOL in dpt
                        iol = extract_data_point(
                            d_df, 175 + x_block, 215 + x_block, 1375 + y_line, 1405 + y_line
                        )

                        # right column: predicted refraction in dpt
                        refr = extract_data_point(
                            d_df, 370 + x_block, 440 + x_block, 1375 + y_line, 1405 + y_line
                        )

                        frames.append(
                            pd.DataFrame(
                                {
                                    "filename": pdf_file,
                                    "patnr": patnr,
                                    "exam_date": exam_date,
                                    "lens_cat": lens_cat,
                                    "RA_aim": RA_aim,
                                    "side": LR_str,
                                    "lens": lens,
                                    "v": v + 1,
                                    "iol": [iol],
                                    "refr": [refr],
                                    "lens_a_const": lens_a_const,
                                }
                            )
                        )

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

## Run Pipeline

### Call PDFs

In [5]:
# collect the PDFs of each report version from its folder (oldest first)
pdf_files_sorted_v15, pdf_files_sorted_v17, pdf_files_sorted_v19 = map(
    list_pdf, ("data/IOL/v1.5", "data/IOL/v1.7", "data/IOL/v1.9")
)

# report how many files were found per version
print(f"n = {len(pdf_files_sorted_v15)} (v1.5), {len(pdf_files_sorted_v17)} (v1.7), {len(pdf_files_sorted_v19)} (v1.9)")

n = 4141 (v1.5), 8350 (v1.7), 8382 (v1.9)


### Extract IOL data

In [None]:
# extract all info from each pdf; every pdf yields one row.
# The one-row frames are collected and concatenated once at the end, because
# concatenating inside the loop copies the whole growing table on every pass.
# NOTE(review): only the first 20 v1.9 files are processed here - confirm the
# [0:20] slice is intended before writing the result to disk.
data_rows = []
for file in pdf_files_sorted_v19[0:20]:
    print(file)
    data_rows.append(extract_iol_v19(file))

# empty df if there was nothing to process
data = pd.concat(data_rows, axis=0) if data_rows else pd.DataFrame()

In [None]:
# show the extracted table as the output of this notebook cell
data

In [9]:
# write the extracted v1.9 rows to disk; index=False leaves the row index out of the file
data.to_csv("data/iol_v1.9_v2.csv", index=False)

### Extract Lens Info

In [133]:
# extract the lens suggestions of every v1.5 pdf.
# The per-pdf frames are collected and concatenated once at the end, because
# concatenating inside the loop copies the whole growing table on every pass.
lens_frames_v15 = []
for c, file in enumerate(pdf_files_sorted_v15, start=1):
    # progress: running number / total and the current file
    print(f"{c}/{len(pdf_files_sorted_v15)} {file}")
    lens_frames_v15.append(iol_lenses_v15(file))

# empty df if there was nothing to process
data_lenses_v15 = pd.concat(lens_frames_v15, axis=0) if lens_frames_v15 else pd.DataFrame()

1/4141 data\IOL\v1.5\{e692591e-371c-4a8e-a7c3-d6da27fe1a8e}.pdf
2/4141 data\IOL\v1.5\{946c5d4b-aa6a-4a0d-a0ce-f4715b916128}.pdf
3/4141 data\IOL\v1.5\{86d6b125-c35d-4265-9a24-a9a96332bf62}.pdf
4/4141 data\IOL\v1.5\{0114ddf7-8a47-4706-ac4e-6e33e2f01197}.pdf
5/4141 data\IOL\v1.5\{ee20a7a9-93d7-42cc-a5cd-0bda3481c80e}.pdf
6/4141 data\IOL\v1.5\{f7f34e6e-70f7-4dc8-9c49-a15361d84ecb}.pdf
7/4141 data\IOL\v1.5\{14d12396-82a7-40a9-8aa9-86ad41a42688}.pdf
8/4141 data\IOL\v1.5\{da3edf9d-7de8-47e1-8394-8143f5ffc9f7}.pdf
9/4141 data\IOL\v1.5\{8c4a0055-31e0-4368-9b8e-2f51b1ed0b76}.pdf
10/4141 data\IOL\v1.5\{ea71ec37-4524-4714-b45d-77071af26674}.pdf
11/4141 data\IOL\v1.5\{52908a35-cbbb-4758-be86-4cb1fc45e162}.pdf
12/4141 data\IOL\v1.5\{bf66ca93-b42b-40d9-bca4-1f625c90e51f}.pdf
13/4141 data\IOL\v1.5\{a71b42bd-04b5-4a50-bd10-4c01e2d280dc}.pdf
14/4141 data\IOL\v1.5\{2b8a19fd-b69f-4596-b834-ea9886a099a4}.pdf
15/4141 data\IOL\v1.5\{92c8f3a3-eb64-4030-aa16-191c4f34775c}.pdf
16/4141 data\IOL\v1.5\{d9f566fd-1c

In [135]:
# persist the v1.5 lens info; index=False leaves the row index out of the file
data_lenses_v15.to_csv("data/iol_lenses_v1.5.csv", index=False)

In [136]:
# Extract the lens info of every v1.7 PDF, printing progress as
# "<position>/<total> <path>", and collect the per-file DataFrames.
lens_rows_v17 = []
for c, file in enumerate(pdf_files_sorted_v17, start=1):
    print(f"{c}/{len(pdf_files_sorted_v17)} {file}")
    data_lenses_row = iol_lenses_v17(file)
    lens_rows_v17.append(data_lenses_row)

# Concatenate once at the end: calling pd.concat inside the loop copies all
# previously collected rows on every iteration (quadratic in the number of PDFs).
# An empty input list yields an empty DataFrame, as the original seed did.
data_lenses_v17 = pd.concat(lens_rows_v17, axis=0) if lens_rows_v17 else pd.DataFrame()

1/8350 data\IOL\v1.7\c0dce91d-6197-422e-af18-3e993d1a5ccc.pdf
2/8350 data\IOL\v1.7\b137e974-5ceb-4410-b8c3-24c29c89e376.pdf
3/8350 data\IOL\v1.7\f8b18149-8758-46ac-a891-2a85ea3d1176.pdf
4/8350 data\IOL\v1.7\92e470be-c9e4-4e19-8343-bd34944cf357.pdf
5/8350 data\IOL\v1.7\e041258f-7c50-4445-93c5-fe9aa7991641.pdf
6/8350 data\IOL\v1.7\fc85a83c-e6e5-4e2d-9286-449100ace989.pdf
7/8350 data\IOL\v1.7\f35bdd63-ce65-4966-ba4b-248c248b3582.pdf
8/8350 data\IOL\v1.7\51b03877-e040-4bfb-baf5-ac71b4f32324.pdf
9/8350 data\IOL\v1.7\9bc33f25-e184-4892-a5a9-7b0619a36ca2.pdf
10/8350 data\IOL\v1.7\775e830a-5805-4157-8c5c-9f784daac2d1.pdf
11/8350 data\IOL\v1.7\5d8c22e4-8d1d-44fd-a5d4-6b366e48052a.pdf
12/8350 data\IOL\v1.7\7b4ed99a-062c-45f8-b899-52c3bc9a4a48.pdf
13/8350 data\IOL\v1.7\e0adb5f7-e684-4816-8aaf-e827c288b189.pdf
14/8350 data\IOL\v1.7\c5725db7-d8cc-4c93-994a-275473c08b12.pdf
15/8350 data\IOL\v1.7\e3e28724-f333-4df8-b830-5073870d4752.pdf
16/8350 data\IOL\v1.7\7401b344-2c85-42bc-9aa6-5ba6c5b18407.pdf
1

In [138]:
# persist the v1.7 lens info; index=False leaves the row index out of the file
data_lenses_v17.to_csv("data/iol_lenses_v1.7.csv", index=False)

In [139]:
# Extract the lens info of every v1.9 PDF, printing progress as
# "<position>/<total> <path>", and collect the per-file DataFrames.
lens_rows_v19 = []
for c, file in enumerate(pdf_files_sorted_v19, start=1):
    print(f"{c}/{len(pdf_files_sorted_v19)} {file}")
    data_lenses_row = iol_lenses_v19(file)
    lens_rows_v19.append(data_lenses_row)

# Concatenate once at the end: calling pd.concat inside the loop copies all
# previously collected rows on every iteration (quadratic in the number of PDFs).
# An empty input list yields an empty DataFrame, as the original seed did.
data_lenses_v19 = pd.concat(lens_rows_v19, axis=0) if lens_rows_v19 else pd.DataFrame()

1/8382 data\IOL\v1.9\74323986-5d01-4a0c-91a0-36a95765b185.pdf
2/8382 data\IOL\v1.9\f92fe0e3-ddc3-4c95-b6d4-f3865cd08ac6.pdf
3/8382 data\IOL\v1.9\d113bc5b-190e-480b-92ee-29e971ab1f6e.pdf
4/8382 data\IOL\v1.9\6a4a4e56-2a2a-4715-bd5c-ebcbd108b736.pdf
5/8382 data\IOL\v1.9\65ee836a-9b53-48a1-a2b6-8e97cbdd543a.pdf
6/8382 data\IOL\v1.9\d81a1449-bf93-427e-ba28-625b024725d5.pdf
7/8382 data\IOL\v1.9\5c4191dc-8ac5-4286-a347-daf17d10407a.pdf
8/8382 data\IOL\v1.9\cfe1beec-bb69-4e5f-834f-f385e02bfade.pdf
9/8382 data\IOL\v1.9\a25e75d2-247b-4f20-950c-b1dee4c4fe73.pdf
10/8382 data\IOL\v1.9\141610eb-54fa-4d27-b906-c2a6c73a3dbf.pdf
11/8382 data\IOL\v1.9\5747d0fe-ccec-48ba-a11a-c9b4bd38f52c.pdf
12/8382 data\IOL\v1.9\170554be-3135-44ad-bc97-8e126aff403a.pdf
13/8382 data\IOL\v1.9\b93fb683-d071-421b-89ab-3f2052c46ba3.pdf
14/8382 data\IOL\v1.9\543ba8eb-465a-4e5a-adb3-64929bd88577.pdf
15/8382 data\IOL\v1.9\fd34ff4d-4db5-4baf-9975-cecff8050699.pdf
16/8382 data\IOL\v1.9\65427eba-5e6c-46ef-b72a-9de82c400073.pdf
1

In [141]:
# persist the v1.9 lens info; index=False leaves the row index out of the file
data_lenses_v19.to_csv("data/iol_lenses_v1.9.csv", index=False)

# Testing Individual PDFs

In [6]:
# pick a single PDF to debug the extraction positions on
pdf_file = pdf_files_sorted_v19[0]


# point pytesseract at the local Tesseract executable
# (hard-coded Windows install path; adjust on other machines)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# render the PDF to images (convert_from_path result is indexed per page below)
pages = convert_from_path(pdf_file)

# run Tesseract OCR via pytesseract on the first page only; the DICT output
# holds one entry per detected element with its position (left, top, width,
# height), confidence (conf) and text
d = pytesseract.image_to_data(pages[0], output_type=pytesseract.Output.DICT)
# tabular view of the OCR result, the input format used with extract_data_point
d_df = pd.DataFrame.from_dict(d)
d_df


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
0,1,1,0,0,0,0,0,0,1653,2339,-1,
1,2,1,1,0,0,0,84,1304,16,418,-1,
2,3,1,1,1,0,0,84,1304,16,418,-1,
3,4,1,1,1,1,0,84,1304,16,418,-1,
4,5,1,1,1,1,1,91,1717,1,5,92,-
...,...,...,...,...,...,...,...,...,...,...,...,...
674,4,1,77,1,1,0,1457,2143,111,14,-1,
675,5,1,77,1,1,1,1457,2143,42,14,96,Seite
676,5,1,77,1,1,2,1508,2143,5,14,90,1
677,5,1,77,1,1,3,1522,2147,30,10,90,von


In [110]:
# show which PDF is currently loaded for testing
# NOTE(review): the recorded output is a v1.5 path although the test cell
# selects from the v1.9 list, presumably because cells were run out of order;
# re-run before relying on it
pdf_file

'data\\IOL\\v1.5\\{e692591e-371c-4a8e-a7c3-d6da27fe1a8e}.pdf'

In [113]:
# Probe individual bounding boxes (left start/end, top start/end) on the OCR
# table to calibrate extraction positions. The first call's result is not
# assigned; in a notebook only the value of the last expression is displayed.
extract_data_point(d_df, 700, 800, 720, 775) # Page Header like IOL-Berechnung
extract_data_point(d_df, 170, 490, 1170, 1260) # Lens type R_1
# further probe positions kept for reference (a-constant, IOL and refraction of lens R_1):
#extract_data_point(d_df, 150, 480, 1310, 1340) # Lens type a konstante R_1
#extract_data_point(d_df, 175, 215, 1375, 1410) # Lens_IOL_v1 R_1
#extract_data_point(d_df, 370, 440, 1375, 1410) # Lens_Ref_v1 R_1



'Phak; GS: Glask6rper; Chirurgie: Unbehandelt;  '

In [None]:
# run the v1.9 extraction on the single test PDF and show the result
# transposed (columns as rows) for easier reading
test = extract_iol_v19(pdf_file)
test.T