In [26]:
import fitz  #PDF Extractor
import numpy as np
import glob
import os


# Folder holding the SunCheck PDF exports
directory_path = "/Users/stempel/Desktop/UWMRRC/Linac Measurements/"

# Collect every PDF in that folder, ordered by path so that files whose
# names sort chronologically are processed in date order
pdf_paths = sorted(glob.glob(os.path.join(directory_path, "*.pdf")))
numDocs = len(pdf_paths)

# Container for everything extracted from the PDFs.
# "ICP" is the profiler, "IC" is the chamber in phantom; each one holds a
# separate (initially empty) dictionary per particle type.
data = {
    chamber: {"photon": {}, "electron": {}}
    for chamber in ("ICP", "IC")
}

# Text markers used to locate data in the PDF (may need adjusting for other PDF layouts)
keywords = (
    "MU",
    "profile constancy",
    "Flatness",
    "Symmetry",
    "Beam Quality",
    "Dose",
    "Wedge factor",
    "EDW",
    "energy",
)
exclusions = ("Photon", "Electron")

In [27]:
# Function setup
def checkSubsection(line): # Determines if at a subsection containing data or not
    """Return True when `line` names a subsection whose value should be recorded.

    A line qualifies when it contains at least one entry of the module-level
    `keywords` tuple and none of the entries of the module-level `exclusions`
    tuple.

    Parameters
    ----------
    line : str
        A single line of text extracted from the PDF.

    Returns
    -------
    bool
    """
    hasKeyword = any(keyword in line for keyword in keywords)
    # The exclusions must be tested against this line itself; testing them
    # against the page-wide list of lines only matched when some whole line
    # was exactly equal to an exclusion word.
    isExcluded = any(exclusion in line for exclusion in exclusions)
    return hasKeyword and not isExcluded

def getSubsection(line): # Returns the name of the subsection (complicated since names vary greatly)
    """Return the subsection name contained in `line`.

    If the text after the first space contains a digit, only the first word
    is taken as the name; otherwise the whole (stripped) line is the name.

    Parameters
    ----------
    line : str
        A single line of text extracted from the PDF.

    Returns
    -------
    str
        The subsection name with surrounding whitespace removed.
    """
    # Strip first so a leading space cannot produce an empty name.
    text = line.strip()
    firstWord, separator, remainingText = text.partition(' ')
    # `separator` is empty when there is no space at all; in that case the
    # whole line is the name (slicing with find()'s -1 would drop a character).
    if separator and any(char.isdigit() for char in remainingText):
        return firstWord
    return text

def getQuadWedges(line): # Used to check if profiler is being used
    """Report whether `line` contains the exact (lower-case) text "quad wedges"."""
    return "quad wedges" in line

def getParticleType(line): # Determines photon or electron
    """Classify `line` by the particle name it contains.

    Returns 'photon' if "Photon" occurs in the line, otherwise 'electron' if
    "Electron" occurs in it, otherwise None. The match is case-sensitive and
    "Photon" takes precedence when both are present.
    """
    markers = (("Photon", 'photon'), ("Electron", 'electron'))
    for marker, particle in markers:
        if marker in line:
            return particle
    return None

def getEnergyLevel(line): # Returns energy level, 6 MV, 10 MV etc.
    """Return the second " - "-separated field of `line`, stripped of whitespace.

    Raises IndexError when the line does not contain the " - " separator.
    """
    fields = line.split(" - ")
    energy = fields[1]
    return energy.strip()

def getMeasurement(line, pageLines=None, index=None): # Returns the measured value from SunCheck
    """Return the number found on the line that follows the current line.

    Parameters
    ----------
    line : str
        The current subsection line. Kept for call compatibility; the value
        itself is read from the following line, not from this one.
    pageLines : list of str, optional
        Lines of the page being parsed. Defaults to the module-level `lines`.
    index : int, optional
        Position of the current line in `pageLines`. Defaults to the
        module-level `i`.

    Returns
    -------
    float

    Raises
    ------
    IndexError
        If there is no line after `index`.
    ValueError
        If the following line is not a valid number.
    """
    # Fall back to the parser's module-level cursor so existing one-argument
    # calls keep working; passing the arguments avoids that hidden state.
    if pageLines is None:
        pageLines = lines
    if index is None:
        index = i
    return float(pageLines[index + 1].strip())


def addMeasurement(chamberType, particleType, energyLevel, subsection, day, value=None): # Adds measured value and creates subdictionary if doesn't exist
    """Store one measured value in the module-level `data` dictionary.

    The value is written to
    ``data[chamberType][particleType][energyLevel][subsection][day]``; the
    energy-level and subsection dictionaries are created when missing.

    Parameters
    ----------
    chamberType : str
        Top-level key of `data` (e.g. 'ICP' or 'IC').
    particleType : str
        Second-level key of `data` (e.g. 'photon' or 'electron').
    energyLevel : str
        Energy label under which the subsection is filed.
    subsection : str
        Name of the measured quantity.
    day : hashable
        Key identifying which document the value came from.
    value : float, optional
        The value to store. Defaults to the module-level `measurement`, so
        existing five-argument calls behave as before.
    """
    if value is None:
        value = measurement
    energyData = data[chamberType][particleType].setdefault(energyLevel, {})
    energyData.setdefault(subsection, {})[day] = value

In [28]:
# Main loop
# Walks every PDF page by page and line by line. A heading line (photon,
# electron, or the "VW Stack" electron variant) opens a section; every
# following subsection line that checkSubsection() accepts has its value
# recorded until the next heading is reached.
#
# NOTE: `lines`, `i` and `measurement` are deliberately kept as module-level
# names because getMeasurement() and addMeasurement() may read them.
for day, pdf_path in enumerate(pdf_paths, start=1):
    # `day` is the 1-based position of the file in pdf_paths and is the key
    # under which each value from this file is stored.
    # The context manager closes the PDF even if parsing raises.
    with fitz.open(pdf_path) as doc:

        # Loop through each page
        page_num = 0
        while page_num < len(doc):
            page = doc[page_num] # Sets up document variables to loop
            text = page.get_text()
            lines = text.split("\n")

            # Loop line by line
            i = 0 # line counter
            while i < len(lines):

                quadWedges = getQuadWedges(lines[i]) # Determines whether profiler is being used, returns boolean

                # The "VW Stack" heading has one extra " - " field, so its
                # energy level is the third field. It must be tested before the
                # generic check because the heading also contains "Electron".
                if "Electron constancy - VW Stack" in lines[i]:
                    particleType = 'electron'
                    energyLevel = lines[i].split(" - ")[2].strip()
                else:
                    particleType = getParticleType(lines[i])
                    if particleType is None: # Not a section heading, move on
                        i += 1
                        continue
                    energyLevel = getEnergyLevel(lines[i])

                # Profiler data goes under "ICP", everything else under "IC"
                chamberType = 'ICP' if quadWedges else 'IC'

                # Will setup energy levels, e.g. 6MV 10MV etc.
                if energyLevel not in data[chamberType][particleType]:
                    data[chamberType][particleType][energyLevel] = {}

                # For the section just opened, take only the value of each
                # wanted subsection (flatness, symmetry, dose, ...) and skip
                # all other text in the PDF.
                i += 1
                while i < len(lines) or page_num < len(doc) - 1:
                    if i == len(lines):
                        # A section can span several pages: continue on the
                        # next page instead of stopping at the end of this one.
                        page_num += 1
                        page = doc[page_num]
                        text = page.get_text()
                        lines = text.split("\n")
                        i = 0
                        continue

                    if getParticleType(lines[i]) in ['photon', 'electron']: # Breaks the loop when a new section is reached
                        break

                    if checkSubsection(lines[i]): # This line names a subsection whose value we want
                        subsection = getSubsection(lines[i])
                        measurement = getMeasurement(lines[i])
                        addMeasurement(chamberType, particleType, energyLevel, subsection, day)
                    i += 1

            page_num += 1

In [29]:
# Convert to numpy arrays
# Each subsection's {day: value} dictionary is replaced by an array of its
# values in insertion order. Entries that are no longer dictionaries are left
# alone so that running this cell a second time does not fail.
for quadWedges in ["ICP", "IC"]:
    for particleType in ["photon", "electron"]:
        for energyLevel, energyData in data[quadWedges][particleType].items():
            for subsection, measurements in energyData.items():
                if isinstance(measurements, dict):
                    # Replacing the value of an existing key is safe while iterating
                    energyData[subsection] = np.array(list(measurements.values()))

# Print the data
# Outline: chamber type, then particle type, then energy level, then one line
# per subsection showing its array of values.
for quadWedges in ["ICP", "IC"]:
    print(f"Data {quadWedges}:")
    for particleType in ["photon", "electron"]:
        print(f"  {particleType.capitalize()}:")
        for energyLevel, energyData in data[quadWedges][particleType].items():
            print(f"    Energy Level: {energyLevel}")
            for subsection, measurements in energyData.items():
                print(f"      {subsection}: {measurements}")
            print()

Data ICP:
  Photon:
    Energy Level: 6 MV
      Beam profile constancy: [0.42 0.04 0.18 0.25 0.27]
      Y-Flatness: [4.07 3.94 4.04 4.1  4.09]
      Y-Symmetry: [-1.91 -1.96 -1.98 -2.04 -1.96]
      X-Flatness: [4.41 4.26 4.38 4.43 4.45]
      X-Symmetry: [1.82 1.76 1.72 1.7  1.62]
      d10 Beam Quality: [66.644 66.688 66.686 66.68  66.694]
      Adjusted Dose / MU: [0.999 1.001 1.007 1.005 1.007]
      Backup MU constancy: [100. 100. 100. 100.]

    Energy Level: 10 MV
      Beam profile constancy: [0.04 0.07 0.08 0.14 0.12]
      Y-Flatness: [4.1  4.14 4.15 4.19 4.18]
      Y-Symmetry: [-2.04 -2.07 -1.99 -2.08 -2.07]
      X-Flatness: [4.18 4.21 4.26 4.31 4.3 ]
      X-Symmetry: [1.82 1.74 1.68 1.65 1.54]
      d10 Beam Quality: [73.997 73.997 74.005 74.005 74.008]
      Adjusted Dose / MU: [1.004 1.001 1.006 1.005 1.009]
      Backup MU constancy: [100. 100. 100. 100.]

    Energy Level: 15 MV
      Beam profile constancy: [0.06 0.02 0.27 0.05 0.05]
      Y-Flatness: [2.72 2.76 2