In [1]:
import fitz  #PDF Extractor
import numpy as np
import glob
import os

# Folder that holds the linac measurement PDFs.
directory_path = "/Users/stempel/Desktop/UWMRRC/Linac Measurements/"

# Every PDF in that folder, in sorted path order so the per-file "day" index
# assigned later is reproducible between runs.
pdf_paths = sorted(glob.glob(os.path.join(directory_path, "*.pdf")))
numDocs = len(pdf_paths)

# Container for everything extracted from the PDFs, keyed first by detector
# (ICP is the profiler, IC is the chamber in phantom) and then by particle
# type. Energy levels and subsections are added underneath as they are found.
data = {
    detector: {"photon": {}, "electron": {}}
    for detector in ("ICP", "IC")
}

# Substrings that mark a measurement row inside a section. The numeric value
# is read from the line immediately after the row label.
photon_keywords = (
    "MU", "profile constancy", "Flatness", "Symmetry", "Beam Quality",
    "Dose", "Wedge factor", "EDW", "energy",
)
electron_keywords = (
    "MU constancy", "profile constancy", "Flatness", "Symmetry",
    "Beam Quality", "Dose", "energy", "EDW",
)

# Walk every PDF (one per "day", numbered from 1) and fill `data` as
#   data[detector][particle][energy_level][subsection][day] = measurement
for day, pdf_path in enumerate(pdf_paths, start=1):
    # The context manager closes the document once it has been parsed, so
    # file handles do not accumulate across PDFs.
    with fitz.open(pdf_path) as doc:
        # Index loops (rather than `for`) because a section may run past the
        # end of a page; the inner loop then advances page_num/lines/i itself.
        page_num = 0
        while page_num < len(doc):
            lines = doc[page_num].get_text().split("\n")

            i = 0
            while i < len(lines):
                line = lines[i]
                # A header mentioning "quad wedges" is filed under ICP,
                # anything else under IC. Evaluated per line, so it reflects
                # the section header itself.
                quad_wedges = "quad wedges" in line

                # Identify the section header: which particle it belongs to,
                # which " - "-separated field holds the energy level, and
                # which row labels to collect.
                if "Photon" in line:
                    particle_type, energy_field, keywords = "photon", 1, photon_keywords
                elif "Electron constancy with quad wedges" in line:
                    particle_type, energy_field, keywords = "electron", 1, electron_keywords
                elif "Electron constancy - VW Stack" in line:
                    # This header contains an extra " - ", so the energy
                    # level is one field further along.
                    particle_type, energy_field, keywords = "electron", 2, electron_keywords
                else:
                    i += 1
                    continue

                # Raises IndexError if the header has no such field.
                energy_level = line.split(" - ")[energy_field].strip()
                detector = "ICP" if quad_wedges else "IC"
                section = data[detector][particle_type].setdefault(energy_level, {})

                # Consume the rows of this section, continuing onto following
                # pages until the next section header or the end of the
                # document.
                i += 1
                while i < len(lines) or page_num < len(doc) - 1:
                    if i == len(lines):
                        # Page exhausted mid-section: carry on with the next.
                        page_num += 1
                        lines = doc[page_num].get_text().split("\n")
                        i = 0
                        continue

                    row = lines[i]
                    if "Photon" in row or "Electron" in row:
                        # Next section header; the outer loop re-reads it.
                        break

                    if any(keyword in row for keyword in keywords):
                        # The first space-separated token of the label is the
                        # subsection key.
                        subsection = row.split(" ")[0].strip()
                        # Raises ValueError if the next line is not a number,
                        # or IndexError if the label is the last line of the
                        # page.
                        measurement = float(lines[i + 1].strip())
                        section.setdefault(subsection, {})[day] = measurement
                    i += 1

            page_num += 1


# Replace every {day: measurement} mapping with a NumPy array of its values,
# in the order the days were recorded.
# NOTE: the day keys are discarded here, so a subsection that was not found
# in every PDF produces a shorter array with no record of which day is
# missing.
for quad_wedges in ("ICP", "IC"):
    for particle_type in ("photon", "electron"):
        for energy_data in data[quad_wedges][particle_type].values():
            # Build all arrays first, then overwrite the existing keys in one
            # step; no keys are added or removed.
            energy_data.update(
                {
                    subsection: np.array(list(measurements.values()))
                    for subsection, measurements in energy_data.items()
                }
            )

# Dump everything that was extracted as an indented report:
# detector -> particle type -> energy level -> subsection values,
# with a blank line after each energy level.
for quad_wedges in ("ICP", "IC"):
    print(f"Data {quad_wedges}:")
    by_particle = data[quad_wedges]
    for particle_type in ("photon", "electron"):
        print(f"  {particle_type.capitalize()}:")
        for energy_level in by_particle[particle_type]:
            print(f"    Energy Level: {energy_level}")
            energy_data = by_particle[particle_type][energy_level]
            for subsection in energy_data:
                measurements = energy_data[subsection]
                print(f"      {subsection}: {measurements}")
            print()


Data ICP:
  Photon:
    Energy Level: 6 MV
      Beam: [0.42 0.04 0.18 0.25 0.27]
      Y-Flatness: [4.07 3.94 4.04 4.1  4.09]
      Y-Symmetry: [-1.91 -1.96 -1.98 -2.04 -1.96]
      X-Flatness: [4.41 4.26 4.38 4.43 4.45]
      X-Symmetry: [1.82 1.76 1.72 1.7  1.62]
      d10: [66.644 66.688 66.686 66.68  66.694]
      Adjusted: [0.999 1.001 1.007 1.005 1.007]
      Backup: [100. 100. 100. 100.]

    Energy Level: 10 MV
      Beam: [0.04 0.07 0.08 0.14 0.12]
      Y-Flatness: [4.1  4.14 4.15 4.19 4.18]
      Y-Symmetry: [-2.04 -2.07 -1.99 -2.08 -2.07]
      X-Flatness: [4.18 4.21 4.26 4.31 4.3 ]
      X-Symmetry: [1.82 1.74 1.68 1.65 1.54]
      d10: [73.997 73.997 74.005 74.005 74.008]
      Adjusted: [1.004 1.001 1.006 1.005 1.009]
      Backup: [100. 100. 100. 100.]

    Energy Level: 15 MV
      Beam: [0.06 0.02 0.27 0.05 0.05]
      Y-Flatness: [2.72 2.76 2.65 2.82 2.83]
      Y-Symmetry: [ 1.25  1.28 -1.22 -1.37 -1.31]
      X-Flatness: [2.77 2.82 2.7  2.85 2.87]
      X-Symmetry