In [5]:
import ipywidgets as widgets
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import imageio
import os
import json
import pickle
import joblib
import h5py
import openpyxl

from monai.transforms import (
    Compose,
    Resize,
    RandRotate,
    Affine,
    RandGaussianNoise,
    RandZoom,
    RepeatChannel,
)

import torch
from torch import Tensor
import torch.optim
import torch.utils.data
import torchvision
import torchvision.transforms as transforms

In [2]:
# Cell purpose: for every patient folder, stack the DICOM slices of the series
# whose folder name contains 'ph1', '1st' or 'firstpass', min-max normalise the
# volume to [0, 1] and store it as float16 in one HDF5 file
# (one group per patient number, dataset name 'data', attribute 'shape').
baseDir="../duke/Breast_MRI_002"

pTotal=3

outDir="data"
os.makedirs(outDir,exist_ok=True) # h5py.File does not create missing parent folders

# lower-case fragments that select the series folder to read
firstPassTags=('ph1','1st','firstpass')

with h5py.File(f'{outDir}/firstpass_{pTotal}.h5','w') as patientData:
    # NOTE(review): range(1,pTotal) visits patients 1..pTotal-1 only; confirm
    # whether patient number pTotal itself is meant to be included.
    for pNum in range(1,pTotal): # change 3 to 500 or however many # in Breast_MRI_###
        patientDir=f"{baseDir}/Breast_MRI_{str(pNum).zfill(3)}"
        if not os.path.isdir(patientDir):
            print(f"folder error for patient {patientDir}")
            continue
        scanTypes=os.listdir(patientDir)
        if len(scanTypes)!=1:
            print(f"multiple scanTypes for patient {patientDir}")
            continue
        scanType=scanTypes[0]
        scanDir=f"{patientDir}/{scanType}"

        # os.listdir returns entries in arbitrary order; sort so that the chosen
        # series (and, below, the slice order) is deterministic.
        seriesLabels=[scanLabel for scanLabel in sorted(os.listdir(scanDir))
                      if os.path.isdir(f"{scanDir}/{scanLabel}")
                      and any(tag in scanLabel.lower() for tag in firstPassTags)]
        if not seriesLabels:
            print(f"no first-pass series found for {patientDir}")
            continue
        if len(seriesLabels)>1:
            # previously a second matching series tried to append to an ndarray and
            # the resulting error was swallowed; now only the first one is read
            print(f"multiple first-pass series for {patientDir}, using {seriesLabels[0]}")
        seriesDir=f"{scanDir}/{seriesLabels[0]}"

        slices=[]
        zCords=[] # z position of the first two slices, used to detect slice direction
        readFail=False
        ### we rely on alphanumeric file order to retain consistency in volumetric ordering,
        ### but if that's not always true, we can find the max number and increment through a range with zfill padding
        for dcmFile in sorted(os.listdir(seriesDir)):
            if not dcmFile.endswith('dcm'):
                continue
            filePath=f"{seriesDir}/{dcmFile}"
            try:
                im = imageio.imread(f'{filePath}')
                slices.append(np.asarray(im))
                if len(zCords)<2:
                    if 'ImagePositionPatient' in im._meta and len(im._meta['ImagePositionPatient'])==3:
                        zCords.append(im._meta['ImagePositionPatient'][2])
            except Exception as err:
                # one unreadable slice invalidates the whole volume
                readFail=True
                print(f"error with file {filePath}: {err!r}\n\n")
                break
        if readFail:
            continue
        if not slices:
            print(f"no dcm slices for {patientDir}")
            continue

        # float64 avoids integer overflow in (images-min) for signed pixel data
        images=np.asarray(slices).astype(np.float64)
        vMin,vMax=images.min(),images.max()
        if vMax==0 or vMax==vMin: # blank or constant volume cannot be normalised
            print(f"max==0 for {patientDir}")
            continue
        images=(images-vMin)/(vMax-vMin)

        if len(zCords)==2: ## do this after L,R bounding box 
            if zCords[0]-zCords[1]>0:
                images=images[::-1,:,:] #standardize top-down slice orientation
        else:
            print(f"zCords!=2 for {patientDir}")

        grp=patientData.create_group(f"{pNum}")
        grp.attrs['shape']=images.shape
        grp.create_dataset('data',data=images.astype(np.float16),compression="gzip", compression_opts=4)



  im = imageio.imread(f'{filePath}')


In [4]:
# Cell purpose: read each patient's first-pass volume, min-max normalise it,
# standardise row / slice orientation, split it at the middle column into a
# left and a right half, crop each half to the voxels above `lowerBound`, and
# store both halves (datasets 'L' and 'R') plus the attributes 'flipRows',
# 'flipZ' and 'LRflag' in one HDF5 group per patient.
metaDir="../duke"

# ---- annotation workbook -> {patient number (str): (column D value, column E value)} ----
boundingBoxXL=openpyxl.load_workbook(f"{metaDir}/Annotation_Boxes.xlsx")
interestSheet=boundingBoxXL[boundingBoxXL.sheetnames[0]] #B2:B start_row, C2:C end_row 

patientBoundBoxCol=dict()
for i in range(2,interestSheet.max_row+1):
    patientId=interestSheet[f"A{i}"].value
    boxStart=interestSheet[f"D{i}"].value
    boxEnd=interestSheet[f"E{i}"].value
    # keep only rows with a "Breast_MRI_<number>" id and integer D/E cells
    if isinstance(patientId,str) and "breast_mri_" in patientId.lower() \
    and patientId.split("_")[-1].isdecimal() and type(boxStart)==int and type(boxEnd)==int:
        patientBoundBoxCol[str(int(patientId.split("_")[-1]))]=(boxStart,boxEnd)


boundingBoxXL.close()

# patients that are not written because their annotation is missing or is not
# entirely on one side of the middle column
midOverlapKeys=set()

baseDir="../duke/Breast_MRI_002"


boundBoxFinals=[] # cropped [slices, rows, cols] size of every stored half

normalizeLR=True

outDir="data"
os.makedirs(outDir,exist_ok=True) # h5py.File does not create missing parent folders

lowerBound=.20

# lower-case fragments that select the series folder to read
firstPassTags=('ph1','1st','firstpass')

with h5py.File(f'{outDir}/FP_LRnorm_crop_point{int(lowerBound*100)}Thresh.h5','w') as patientData:
    # os.listdir order is arbitrary; sorting makes runs reproducible
    for pDir in sorted(os.listdir(baseDir)):
        patientDir=f"{baseDir}/{pDir}"
        if not pDir.split("_")[-1].isdecimal() or not os.path.isdir(patientDir):
            print(f'skipping {patientDir}')
            continue
        pNum=int(pDir.split("_")[-1])
        key=str(pNum)# use str to access patientBoundBoxCol
        if key not in patientBoundBoxCol:
            # used to raise KeyError; such patients are now skipped and recorded
            print(f"no bounding box for patient {patientDir}")
            midOverlapKeys.add(key)
            continue
        scanTypes=os.listdir(patientDir)
        if len(scanTypes)!=1:
            print(f"multiple scanTypes for patient {patientDir}")
            continue
        scanType=scanTypes[0]
        scanDir=f"{patientDir}/{scanType}"

        seriesLabels=[scanLabel for scanLabel in sorted(os.listdir(scanDir))
                      if os.path.isdir(f"{scanDir}/{scanLabel}")
                      and any(tag in scanLabel.lower() for tag in firstPassTags)]
        if not seriesLabels:
            print(f"no first-pass series found for {patientDir}")
            continue
        if len(seriesLabels)>1:
            print(f"multiple first-pass series for {patientDir}, using {seriesLabels[0]}")
        seriesDir=f"{scanDir}/{seriesLabels[0]}"

        slices=[]
        zCords=[] # z position of the first two slices, used to detect slice direction
        readFail=False
        ### we rely on alphanumeric file order to retain consistency in volumetric ordering,
        ### but if that's not always true, we can find the max number and increment through a range with zfill padding
        for dcmFile in sorted(os.listdir(seriesDir)):
            if not dcmFile.endswith('dcm'):
                continue
            filePath=f"{seriesDir}/{dcmFile}"
            try:
                im = imageio.imread(f'{filePath}')
                slices.append(np.asarray(im))
                if len(zCords)<2:
                    if 'ImagePositionPatient' in im._meta and len(im._meta['ImagePositionPatient'])==3:
                        zCords.append(im._meta['ImagePositionPatient'][2])
            except Exception as err:
                # one unreadable slice invalidates the whole volume
                readFail=True
                print(f"error with file {filePath}: {err!r}\n\n")
                break
        if readFail:
            continue
        if not slices:
            print(f"no dcm slices for {patientDir}")
            continue

        # float64 avoids integer overflow in (images-min) for signed pixel data
        images=np.asarray(slices).astype(np.float64)
        vMin,vMax=images.min(),images.max()
        if vMax==0 or vMax==vMin: # blank or constant volume cannot be normalised
            print(f"max==0 for {patientDir}")
            continue
        images=((images-vMin)/(vMax-vMin)).astype(np.float16)
        midCol=images.shape[2]//2

        boxStart,boxEnd=patientBoundBoxCol[key]
        inLeft=boxStart<midCol and boxEnd<midCol
        inRight=boxStart>midCol and boxEnd>midCol
        if not (inLeft or inRight): # annotation touches or crosses the middle column
            midOverlapKeys.add(key)
            continue

        # flip rows when the bright voxels start closer to row 0 than they end to the last row
        _,rowIdx,_ = np.where(images>lowerBound)
        flipRows=bool(np.min(rowIdx)<images.shape[1]-np.max(rowIdx))
        if flipRows:
            images = images[:,::-1,:]

        # 'flipZ' is always written (False when the direction is unknown) so that
        # readers of the file can rely on the attribute being present
        flipZ=False
        if len(zCords)==2: ## do this after L,R bounding box 
            flipZ=bool(zCords[0]-zCords[1]>0)
            if flipZ:
                images=images[::-1,:,:] #standardize top-down slice orientation
        else:
            print(f"zCords!=2 for {patientDir}")

        halves=[images[:,:,:midCol],images[:,:,midCol:]]
        croppedLR=[]
        cropSizes=[]
        # the index is called `side` (not `im`) so the last DICOM slice stays
        # available as `im` after this cell
        for side in range(len(halves)):
            half=halves[side]
            if normalizeLR:
                hMin,hMax=half.min(),half.max()
                if hMax>hMin: # a constant half is left as is instead of becoming NaN
                    half=(half-hMin)/(hMax-hMin)
            sliceIdx,rowIdx,colIdx = np.where(half>lowerBound)
            if sliceIdx.size==0: # nothing above the threshold: no bounding box exists
                break
            boundingBox=[[np.min(sliceIdx),np.max(sliceIdx)+1],[np.min(rowIdx),np.max(rowIdx)+1],[np.min(colIdx),np.max(colIdx)+1]]
            cropSizes.append([bb[1]-bb[0] for bb in boundingBox])
            croppedLR.append(half[boundingBox[0][0]:boundingBox[0][1],boundingBox[1][0]:boundingBox[1][1],boundingBox[2][0]:boundingBox[2][1]])
        if len(croppedLR)!=2:
            print(f"no voxels above {lowerBound} in one half for {patientDir}")
            continue

        # create the group only once everything succeeded, so the file never
        # contains half-written patients
        grp=patientData.create_group(key)
        grp.attrs['flipRows']=flipRows
        grp.attrs['flipZ']=flipZ
        grp.create_dataset('L',data=croppedLR[0],compression="gzip", compression_opts=6)
        grp.create_dataset('R',data=croppedLR[1],compression="gzip", compression_opts=6)
        ### LRflag is [1,0] when the annotation lies left of the middle column, else [0,1]
        if inLeft:
            grp.attrs['LRflag']=np.array([1,0])
        else:
            grp.attrs['LRflag']=np.array([0,1])
        boundBoxFinals.extend(cropSizes)

boundBoxFinals=np.asarray(boundBoxFinals)
if boundBoxFinals.size:
    resizeAvg=boundBoxFinals.mean(axis=0)
    print(resizeAvg)
else:
    resizeAvg=None
    print("no volumes were written, resizeAvg is undefined")

  im = imageio.imread(f'{filePath}')


[151.   366.25 237.  ]


In [5]:
# Scratch check: 99.5th percentile of whatever volume `images` currently holds
# (the name is left over from the most recently executed processing cell).
np.percentile(images, 99.5)

0.5281855549031121

In [7]:
# Scratch check: list the metadata fields attached to `im`.
# Requires `im` to still be the last array returned by imageio.imread
# (i.e. not rebound to something else since the read loop ran).
im._meta.keys()

odict_keys(['TransferSyntaxUID', 'SOPClassUID', 'SOPInstanceUID', 'StudyDate', 'SeriesDate', 'AcquisitionDate', 'ContentDate', 'StudyTime', 'SeriesTime', 'AcquisitionTime', 'ContentTime', 'Modality', 'Manufacturer', 'StudyDescription', 'SeriesDescription', 'PatientName', 'PatientID', 'PatientBirthDate', 'PatientSex', 'SliceSpacing', 'StudyInstanceUID', 'SeriesInstanceUID', 'SeriesNumber', 'AcquisitionNumber', 'InstanceNumber', 'ImagePositionPatient', 'ImageOrientationPatient', 'SamplesPerPixel', 'Rows', 'Columns', 'PixelSpacing', 'BitsAllocated', 'BitsStored', 'HighBit', 'PixelRepresentation', 'PixelData', 'shape', 'sampling'])

In [7]:
# Scratch check: the 3-tuple stored under 'ImagePositionPatient' for the slice in `im`;
# the processing cells use its third element ([2]) as the slice's z position.
im._meta['ImagePositionPatient']

(154.724, 176.048, -88.723)

In [6]:
# Scratch check: first element of 'ImageOrientationPatient' for the slice in `im`.
# Presumably the x component of the row direction cosine relative to the patient
# (confirm against the DICOM Image Plane Module); the value recorded below is -1.0.
im._meta['ImageOrientationPatient'][0]

-1.0

In [3]:
# Cell purpose: same pipeline as the min-max cell, but intensities are scaled by
# the 99.5th percentile (clipped at 1), the row flip is decided from the DICOM
# orientation metadata, and the left half is mirrored so both halves share the
# same orientation relative to the middle column. Output: one HDF5 group per
# patient with datasets 'L' and 'R' (float16) and attributes 'flipRows',
# 'flipZ' and 'LRflag'.
metaDir="../duke"

# ---- annotation workbook -> {patient number (str): (column D value, column E value)} ----
boundingBoxXL=openpyxl.load_workbook(f"{metaDir}/Annotation_Boxes.xlsx")
interestSheet=boundingBoxXL[boundingBoxXL.sheetnames[0]] #B2:B start_row, C2:C end_row 

patientBoundBoxCol=dict()
for i in range(2,interestSheet.max_row+1):
    patientId=interestSheet[f"A{i}"].value
    boxStart=interestSheet[f"D{i}"].value
    boxEnd=interestSheet[f"E{i}"].value
    # keep only rows with a "Breast_MRI_<number>" id and integer D/E cells
    if isinstance(patientId,str) and "breast_mri_" in patientId.lower() \
    and patientId.split("_")[-1].isdecimal() and type(boxStart)==int and type(boxEnd)==int:
        patientBoundBoxCol[str(int(patientId.split("_")[-1]))]=(boxStart,boxEnd)


boundingBoxXL.close()

# patients that are not written because their annotation is missing or is not
# entirely on one side of the middle column
midOverlapKeys=set()

baseDir="../duke/Breast_MRI_002"


boundBoxFinals=[] # cropped [slices, rows, cols] size of every stored half

normalizeLR=True

outDir="data"
os.makedirs(outDir,exist_ok=True) # h5py.File does not create missing parent folders


def normalizeFn(volume):
    """Scale `volume` so its minimum maps to 0 and its 99.5th percentile to 1.

    Values above the 99.5th percentile are clipped to 1. The computation is
    done in float32 so raw intensities are not squeezed into float16 first.
    Returns a float32 array of the same shape; all zeros when the 99.5th
    percentile equals the minimum (no usable intensity range).
    """
    volume=np.asarray(volume,dtype=np.float32)
    vMin=volume.min()
    span=np.percentile(volume,99.5)-vMin
    if not span>0:
        return np.zeros_like(volume)
    return np.clip((volume-vMin)/span,None,1).astype(np.float32)


lowerBound=.10

# lower-case fragments that select the series folder to read
firstPassTags=('ph1','1st','firstpass')

with h5py.File(f'{outDir}/FP_LRnorm_crop_point{int(lowerBound*100)}Thresh.h5','w') as patientData:
    # os.listdir order is arbitrary; sorting makes runs reproducible
    for pDir in sorted(os.listdir(baseDir)):
        patientDir=f"{baseDir}/{pDir}"
        if not pDir.split("_")[-1].isdecimal() or not os.path.isdir(patientDir):
            print(f'skipping {patientDir}')
            continue
        pNum=int(pDir.split("_")[-1])
        key=str(pNum)# use str to access patientBoundBoxCol
        if key not in patientBoundBoxCol:
            # the former `a and b or c` test still indexed the dict for missing
            # keys; such patients are now skipped and recorded
            print(f"no bounding box for patient {patientDir}")
            midOverlapKeys.add(key)
            continue
        scanTypes=os.listdir(patientDir)
        if len(scanTypes)!=1:
            print(f"multiple scanTypes for patient {patientDir}")
            continue
        scanType=scanTypes[0]
        scanDir=f"{patientDir}/{scanType}"

        seriesLabels=[scanLabel for scanLabel in sorted(os.listdir(scanDir))
                      if os.path.isdir(f"{scanDir}/{scanLabel}")
                      and any(tag in scanLabel.lower() for tag in firstPassTags)]
        if not seriesLabels:
            print(f"no first-pass series found for {patientDir}")
            continue
        if len(seriesLabels)>1:
            print(f"multiple first-pass series for {patientDir}, using {seriesLabels[0]}")
        seriesDir=f"{scanDir}/{seriesLabels[0]}"

        slices=[]
        zCords=[] # z position of the first two slices, used to detect slice direction
        rowOrientation=[] # first orientation value of the first slice that provides one
        readFail=False
        ### we rely on alphanumeric file order to retain consistency in volumetric ordering,
        ### but if that's not always true, we can find the max number and increment through a range with zfill padding
        for dcmFile in sorted(os.listdir(seriesDir)):
            if not dcmFile.endswith('dcm'):
                continue
            filePath=f"{seriesDir}/{dcmFile}"
            try:
                im = imageio.imread(f'{filePath}')
                slices.append(np.asarray(im))
                if len(zCords)<2:
                    if 'ImagePositionPatient' in im._meta and len(im._meta['ImagePositionPatient'])==3:
                        zCords.append(im._meta['ImagePositionPatient'][2])
                if not rowOrientation:
                    # validate the orientation field itself (six direction cosines),
                    # not the position field as before
                    if 'ImageOrientationPatient' in im._meta and len(im._meta['ImageOrientationPatient'])==6:
                        rowOrientation.append(im._meta['ImageOrientationPatient'][0])
            except Exception as err:
                # one unreadable slice invalidates the whole volume
                readFail=True
                print(f"error with file {filePath}: {err!r}\n\n")
                break
        if readFail:
            continue
        if not slices:
            print(f"no dcm slices for {patientDir}")
            continue

        images=np.asarray(slices)
        if images.max()==0:
            print(f"max==0 for {patientDir}")
            continue
        images=normalizeFn(images) # float32; cast to float16 only when storing
        midCol=images.shape[2]//2

        boxStart,boxEnd=patientBoundBoxCol[key]
        inLeft=boxStart<midCol and boxEnd<midCol
        inRight=boxStart>midCol and boxEnd>midCol
        if not (inLeft or inRight): # annotation touches or crosses the middle column
            midOverlapKeys.add(key)
            continue

        # NOTE(review): the orientation value read above is element [0] of
        # ImageOrientationPatient, yet the flip is applied along axis 1 (rows);
        # confirm this is the intended axis.
        flipRows=len(rowOrientation)>0 and rowOrientation[0]==-1
        if flipRows:
            images = images[:,::-1,:]

        # 'flipZ' is always written (False when the direction is unknown) so that
        # readers of the file can rely on the attribute being present
        flipZ=False
        if len(zCords)==2: ## do this after L,R bounding box 
            flipZ=bool(zCords[0]-zCords[1]>0)
            if flipZ:
                images=images[::-1,:,:] #standardize top-down slice orientation
        else:
            print(f"zCords!=2 for {patientDir}")

        imageL=images[:,:,:midCol]
        imageR=images[:,:,midCol:]
        imageL=imageL[:,:,::-1] #this should normalize patient midline orientation
        halves=[imageL,imageR]
        croppedLR=[]
        cropSizes=[]
        # the index is called `side` (not `im`) so the last DICOM slice stays
        # available as `im` after this cell
        for side in range(len(halves)):
            half=halves[side]
            if normalizeLR:
                half=normalizeFn(half)
            sliceIdx,rowIdx,colIdx = np.where(half>lowerBound)
            if sliceIdx.size==0: # nothing above the threshold: no bounding box exists
                break
            boundingBox=[[np.min(sliceIdx),np.max(sliceIdx)+1],[np.min(rowIdx),np.max(rowIdx)+1],[np.min(colIdx),np.max(colIdx)+1]]
            cropSizes.append([bb[1]-bb[0] for bb in boundingBox])
            crop=half[boundingBox[0][0]:boundingBox[0][1],boundingBox[1][0]:boundingBox[1][1],boundingBox[2][0]:boundingBox[2][1]]
            croppedLR.append(crop.astype(np.float16))
        if len(croppedLR)!=2:
            print(f"no voxels above {lowerBound} in one half for {patientDir}")
            continue

        # create the group only once everything succeeded, so the file never
        # contains half-written patients
        grp=patientData.create_group(key)
        grp.attrs['flipRows']=flipRows
        grp.attrs['flipZ']=flipZ
        grp.create_dataset('L',data=croppedLR[0],compression="gzip", compression_opts=6)
        grp.create_dataset('R',data=croppedLR[1],compression="gzip", compression_opts=6)
        ### LRflag is [1,0] when the annotation lies left of the middle column, else [0,1]
        if inLeft:
            grp.attrs['LRflag']=np.array([1,0])
        else:
            grp.attrs['LRflag']=np.array([0,1])
        boundBoxFinals.extend(cropSizes)

boundBoxFinals=np.asarray(boundBoxFinals)
if boundBoxFinals.size:
    resizeAvg=boundBoxFinals.mean(axis=0)
    print(resizeAvg)
else:
    resizeAvg=None
    print("no volumes were written, resizeAvg is undefined")

  im = imageio.imread(f'{filePath}')


[151.   455.25 239.25]


In [None]:
### now let's resize the data and save to one final h5 file for the pipeline
### every 'L' / 'R' volume of the input file is resized to one common shape,
### min-max normalised again and written as float16 to the output file

##resize
#imgSize=resizeAvg ## retrieved from first read block or rounded average of grp[shape]
imgSize=[173, 388, 238]
downSample=3.2
# a plain list cannot be divided by a float, so divide via numpy; the target
# size is handed to Resize as plain Python ints
imgSize=tuple(int(s) for s in np.round(np.asarray(imgSize)/downSample))
transform=Compose([Resize(spatial_size=imgSize)])


#checkKeys=['18', '19', '20', '21', '25', '26', '29']


lowerBound=.15
inputData=f'data/FP923_LRsplit_crop_point{int(lowerBound*100)}Thresh.h5'
outputFile=f'data/FP923_LR_avgCrop_DS{int(downSample*10)}_point{int(lowerBound*100)}Thresh.h5'

# open the input first: if it is missing, the output file is not created/truncated
with h5py.File(inputData, 'r') as f, h5py.File(f'{outputFile}','w') as patientData:
    for key in f:
        grp=patientData.create_group(key)
        # copy the per-patient attributes that are present (older files may lack 'flipZ')
        for attrName in ('LRflag','flipRows','flipZ'):
            if attrName in f[key].attrs:
                grp.attrs[attrName]=f[key].attrs[attrName]
        for subKey in ['L','R']:
            # interpolate in float32; half precision is only used for storage
            arr=f[key][subKey][:].astype(np.float32)
            volume=torch.as_tensor(arr).unsqueeze(0) # add the channel axis Resize expects
            volume=transform(volume)
            images=volume[0].numpy()
            vMin,vMax=images.min(),images.max()
            if vMax>vMin:
                images=(images-vMin)/(vMax-vMin)
            else: # constant volume: avoid 0/0
                images=np.zeros_like(images)
            grp.create_dataset(subKey,data=images.astype(np.float16),compression="gzip", compression_opts=6)


In [None]:
# imgSize (before down-sampling) used in the resize cell above: 173 388 238

In [13]:
# number of patients with a usable annotation row
len(patientBoundBoxCol)

922