In [1]:
import argparse
import os
import math
import numpy as np
import SimpleITK as sitk
import random
import pandas as pd
from typing import Tuple, Dict

import torch
from torch import Tensor
import torch.optim
import torch.utils.data
import torchvision
import torchvision.transforms as transforms

from monai.transforms import (
    Compose,
    Resize,
    RandRotate,
    Affine,
    RandGaussianNoise,
    RandZoom,
    RepeatChannel,
)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import joblib
import h5py

In [2]:
yflags=pd.read_csv("../duke/ClinicalFlags.csv",index_col=0)
yflags.columns

Index(['StagingNodes'], dtype='object')

In [15]:
yflags.index

Index(['Breast_MRI_001', 'Breast_MRI_002', 'Breast_MRI_003', 'Breast_MRI_004',
       'Breast_MRI_005', 'Breast_MRI_006', 'Breast_MRI_007', 'Breast_MRI_008',
       'Breast_MRI_009', 'Breast_MRI_010',
       ...
       'Breast_MRI_913', 'Breast_MRI_914', 'Breast_MRI_915', 'Breast_MRI_916',
       'Breast_MRI_917', 'Breast_MRI_918', 'Breast_MRI_919', 'Breast_MRI_920',
       'Breast_MRI_921', 'Breast_MRI_922'],
      dtype='object', name='Patient ID', length=922)

In [55]:
type(yflags['StagingNodes']['Breast_MRI_014'])

numpy.float64

In [51]:

np.isnan(yflags['StagingNodes']['Breast_MRI_016'])

True

In [12]:
# 1-based row positions of the patients whose StagingNodes value is missing.
# The positions come from the boolean mask itself: the Series is indexed by
# patient-id strings, so integer keys in `series[i]` are a deprecated positional fallback.
(np.flatnonzero(yflags['StagingNodes'].isna().to_numpy())+1).tolist()

[16,
 31,
 63,
 65,
 80,
 122,
 140,
 238,
 322,
 386,
 394,
 401,
 406,
 535,
 574,
 637,
 646,
 674,
 704,
 743,
 814,
 861,
 876,
 906]

In [34]:
patientData=[]
patientIndices=[]

inputData='data/firstpass_923_cropped.h5'

# Walk every patient group and record its integer id together with the shape
# stored in the group's attributes. The voxel data itself is not read here.
shapeRows=[]
with h5py.File(inputData, 'r') as f:
    for key, grp in f.items():
        patientIndices.append(int(key))
        shapeRows.append(grp.attrs['shape'])

# One row per patient, one column per axis.
patientDimensions=np.asarray(shapeRows)


In [35]:
avgShape=patientDimensions.mean(axis=0)
imgShape=np.round(avgShape)
imgShape

array([171., 471., 483.])

In [39]:
###try resize+resave for compression


inputData='data/firstpass_923.h5'
hfile=h5py.File(inputData,"r")


In [40]:
arr=hfile['21']['data'][:]

In [41]:
hfile.close()

In [22]:
arr.shape

(192, 512, 512)

In [15]:
xs,ys,zs = np.where(arr!=0) 


In [21]:
boundingBox=[[np.min(xs),np.max(xs)+1],[np.min(ys),np.max(ys)+1],[np.min(zs),np.max(zs)+1]]
boundingBox

[[4, 192], [0, 466], [0, 512]]

In [13]:
### first do zero slicing: per-slice bounding box of the non-zero pixels
boundingBoxes=[]

for sliceImg in arr:
    xs,ys= np.where(sliceImg!=0)
    if xs.size==0:
        # An all-zero slice has no bounding box; keep a placeholder so that
        # boundingBoxes[n] still lines up with slice n.
        boundingBoxes.append(None)
        continue
    boundingBoxes.append([[xs.min(),xs.max()+1],[ys.min(),ys.max()+1]])
    

#then we can resize and such


In [4]:
### go through, grab the bounding boxes -- crop and save h5 (sub-sample of patients)
boundBoxFinals=[]

inputData='data/firstpass_923.h5'
outputFile='data/firstpass_testCrop_03thresh.h5'
lowerBound=.03
checkKeys=[19, 20, 30, 21, 29, 25, 26, 18]
wantedKeys={str(k) for k in checkKeys}

# A dedicated handle name keeps the `patientData` list from the earlier cell intact.
with h5py.File(outputFile,'w') as croppedFile, h5py.File(inputData, 'r') as f:
    # Filter in file order; iterating a set intersection has arbitrary order.
    for key in [k for k in f.keys() if k in wantedKeys]:
        arr=f[key]['data'][:]
        # Coordinates of every voxel brighter than the threshold.
        xs,ys,zs = np.where(arr>lowerBound)
        if xs.size==0:
            # Nothing above the threshold: min()/max() would raise on empty arrays.
            print(f"skipping {key}: no voxel above {lowerBound}")
            continue

        # Half-open [start, stop) bounds per axis.
        boundingBox=[[xs.min(),xs.max()+1],[ys.min(),ys.max()+1],[zs.min(),zs.max()+1]]
        boundBoxFinals.append(boundingBox)

        (x0,x1),(y0,y1),(z0,z1)=boundingBox
        result=arr[x0:x1,y0:y1,z0:z1]

        grp=croppedFile.create_group(key)
        grp.attrs['shape']=result.shape
        grp.create_dataset('data',data=result.astype(np.float16),compression="gzip", compression_opts=4)


In [20]:
### go through, grab the bounding boxes -- crop and save h5 (every patient)
boundBoxFinals=[]

inputData='data/firstpass_923.h5'

lowerBound=.15
outputFile=f'data/firstpass_923_cropped_point{lowerBound*100}Thresh.h5'
checkKeys=[19, 20, 30, 21, 29, 25, 26, 18]

# A dedicated handle name keeps the `patientData` list from the earlier cell intact.
with h5py.File(outputFile,'w') as croppedFile, h5py.File(inputData, 'r') as f:
    for key in f.keys():
        arr=f[key]['data'][:]
        # Coordinates of every voxel brighter than the threshold.
        xs,ys,zs = np.where(arr>lowerBound)
        if xs.size==0:
            # Nothing above the threshold: min()/max() would raise on empty arrays.
            print(f"skipping {key}: no voxel above {lowerBound}")
            continue

        # Half-open [start, stop) bounds per axis.
        boundingBox=[[xs.min(),xs.max()+1],[ys.min(),ys.max()+1],[zs.min(),zs.max()+1]]
        boundBoxFinals.append(boundingBox)

        (x0,x1),(y0,y1),(z0,z1)=boundingBox
        result=arr[x0:x1,y0:y1,z0:z1]

        grp=croppedFile.create_group(key)
        grp.attrs['shape']=result.shape
        grp.create_dataset('data',data=result.astype(np.float16),compression="gzip", compression_opts=4)


### resizing

In [21]:
##resize
boundBoxFinals=np.asarray(boundBoxFinals)  # (n_patients, 3, 2): [start, stop) per axis
avgShape=boundBoxFinals.mean(axis=0)
imgSize=np.round(avgShape)                 # still a (3, 2) table of averaged bounds
# Resize needs one length per axis, so turn each [start, stop) row into its extent.
transform=Compose([Resize(spatial_size=[int(stop-start) for start,stop in imgSize])])

In [8]:
imgSize

array([[  2., 169.],
       [ 56., 414.],
       [  8., 479.]])

In [22]:
imgSize#new

array([[  4., 167.],
       [ 73., 407.],
       [ 12., 476.]])

In [11]:
resizeAvg=imgSize[:,1]-imgSize[:,0]
resizeAvg

array([167., 358., 471.])

In [23]:
resizeAvg=imgSize[:,1]-imgSize[:,0]
resizeAvg #new

array([163., 334., 464.])

In [33]:
imgSize ## old imgSize -> [0,171 , 5-476 , 2-485]

array([[  0., 171.],
       [  5., 476.],
       [  2., 485.]])

In [2]:
### now let's resize the data and save to one final h5 file for the pipeline (small sub-sample)

##resize
imgSize=np.array([163., 334., 464.]) ## retrieved from first read block or rounded average of grp[shape]
downSample=2.0
imgSize=np.round(imgSize/downSample)

# This run ignores imgSize and uses a much smaller hard-coded target instead.
#transform=Compose([Resize(spatial_size=imgSize)])
transform=Compose([Resize(spatial_size=[23,47,66])])

checkKeys=[19, 20, 30, 21, 29, 25, 26, 18]
wantedKeys={str(k) for k in checkKeys}

lowerBound=.20
inputData=f'data/firstpass_923_cropped_point{lowerBound*100}Thresh.h5'
outputFile=f'data/firstpass_923_avgCropResize_DS70_point{int(lowerBound*100)}Thresh_subSample.h5'

# A dedicated handle name keeps the `patientData` list from the earlier cell intact.
with h5py.File(outputFile,'w') as resizedFile, h5py.File(inputData, 'r') as f:
    # Filter in file order; iterating a set intersection has arbitrary order.
    for key in [k for k in f.keys() if k in wantedKeys]:
        arr=f[key]['data'][:]
        # Add a leading channel axis before handing the volume to the transform.
        volume=transform(torch.tensor(arr).unsqueeze(0))

        # Min-max normalise in float32 and only then round to float16 for storage.
        images=volume.numpy().astype(np.float32)
        lo,hi=images.min(),images.max()
        # A constant volume would be 0/0; store zeros instead of NaNs.
        images=(images-lo)/(hi-lo) if hi>lo else np.zeros_like(images)

        grp=resizedFile.create_group(key)
        # Record the size that was actually written (imgSize is not what this run uses).
        grp.attrs['shape']=images.shape[1:]
        grp.create_dataset('data',data=images.astype(np.float16),compression="gzip", compression_opts=6)



In [2]:
### now let's resize the data and save to one final h5 file for the pipeline

##resize
imgSize=np.array([163., 334., 464.]) ## retrieved from first read block or rounded average of grp[shape]
downSample=3.0
imgSize=np.round(imgSize/downSample)

# np.round leaves floats; hand Resize plain ints.
transform=Compose([Resize(spatial_size=[int(s) for s in imgSize])])

checkKeys=[19, 20, 30, 21, 29, 25, 26, 18]

lowerBound=.20
inputData=f'data/firstpass_923_cropped_point{lowerBound*100}Thresh.h5'
# NOTE(review): unlike the sibling cells this name formats downSample*10 as a
# float ("DS30.0"); left unchanged in case something already reads that file.
outputFile=f'data/firstpass_923_avgCropResize_DS{downSample*10}_point{int(lowerBound*100)}Thresh.h5'

# A dedicated handle name keeps the `patientData` list from the earlier cell intact.
with h5py.File(outputFile,'w') as resizedFile, h5py.File(inputData, 'r') as f:
    for key in f.keys():
        arr=f[key]['data'][:]
        # Add a leading channel axis before handing the volume to the transform.
        volume=transform(torch.tensor(arr).unsqueeze(0))

        # Min-max normalise in float32 and only then round to float16 for storage.
        images=volume.numpy().astype(np.float32)
        lo,hi=images.min(),images.max()
        # A constant volume would be 0/0; store zeros instead of NaNs.
        images=(images-lo)/(hi-lo) if hi>lo else np.zeros_like(images)

        grp=resizedFile.create_group(key)
        # Record the spatial size of what was actually written.
        grp.attrs['shape']=images.shape[1:]
        grp.create_dataset('data',data=images.astype(np.float16),compression="gzip", compression_opts=6)



In [28]:
### now let's resize the data and save to one final h5 file for the pipeline

##resize
imgSize=np.array([167., 358., 471.]) ## retrieved from first read block or rounded average of grp[shape]
downSample=2.0
imgSize=np.round(imgSize/downSample)

# np.round leaves floats; hand Resize plain ints.
transform=Compose([Resize(spatial_size=[int(s) for s in imgSize])])

checkKeys=[19, 20, 30, 21, 29, 25, 26, 18]

lowerBound=.15
inputData=f'data/firstpass_923_cropped_point{lowerBound*100}Thresh.h5'
outputFile=f'data/firstpass_923_avgCropResize_DS{int(downSample*10)}_point{int(lowerBound*100)}Thresh.h5'

# A dedicated handle name keeps the `patientData` list from the earlier cell intact.
with h5py.File(outputFile,'w') as resizedFile, h5py.File(inputData, 'r') as f:
    for key in f.keys():
        arr=f[key]['data'][:]
        # Add a leading channel axis before handing the volume to the transform.
        volume=transform(torch.tensor(arr).unsqueeze(0))

        # Min-max normalise in float32 and only then round to float16 for storage.
        images=volume.numpy().astype(np.float32)
        lo,hi=images.min(),images.max()
        # A constant volume would be 0/0; store zeros instead of NaNs.
        images=(images-lo)/(hi-lo) if hi>lo else np.zeros_like(images)

        grp=resizedFile.create_group(key)
        # Record the spatial size of what was actually written.
        grp.attrs['shape']=images.shape[1:]
        grp.create_dataset('data',data=images.astype(np.float16),compression="gzip", compression_opts=6)



In [29]:
### now let's resize the data and save to one final h5 file for the pipeline

##resize
imgSize=np.array([163., 334., 464.]) ## retrieved from first read block or rounded average of grp[shape]
downSample=2.0
imgSize=np.round(imgSize/downSample)

# np.round leaves floats; hand Resize plain ints.
transform=Compose([Resize(spatial_size=[int(s) for s in imgSize])])

checkKeys=[19, 20, 30, 21, 29, 25, 26, 18]

lowerBound=.20
inputData=f'data/firstpass_923_cropped_point{lowerBound*100}Thresh.h5'
outputFile=f'data/firstpass_923_avgCropResize_DS{int(downSample*10)}_point{int(lowerBound*100)}Thresh.h5'

# A dedicated handle name keeps the `patientData` list from the earlier cell intact.
with h5py.File(outputFile,'w') as resizedFile, h5py.File(inputData, 'r') as f:
    for key in f.keys():
        arr=f[key]['data'][:]
        # Add a leading channel axis before handing the volume to the transform.
        volume=transform(torch.tensor(arr).unsqueeze(0))

        # Min-max normalise in float32 and only then round to float16 for storage.
        images=volume.numpy().astype(np.float32)
        lo,hi=images.min(),images.max()
        # A constant volume would be 0/0; store zeros instead of NaNs.
        images=(images-lo)/(hi-lo) if hi>lo else np.zeros_like(images)

        grp=resizedFile.create_group(key)
        # Record the spatial size of what was actually written.
        grp.attrs['shape']=images.shape[1:]
        grp.create_dataset('data',data=images.astype(np.float16),compression="gzip", compression_opts=6)


In [15]:

with h5py.File(outputFile, 'r') as f:
    arr=f['19']['data'][:]

In [16]:
arr.shape

(1, 83, 179, 235)

In [48]:
### now let's resize the data and save to one final h5 file for the pipeline

##resize
imgSize=[171,471,483] ## retrieved from first read block or rounded average of grp[shape]
transform=Compose([Resize(spatial_size=imgSize)])

checkKeys=[19, 20, 30, 21, 29, 25, 26, 18]

inputData='data/firstpass_923_cropped.h5'
outputFile='data/firstpass_923_avgCropResize.h5'

# A dedicated handle name keeps the `patientData` list from the earlier cell intact.
with h5py.File(outputFile,'w') as resizedFile, h5py.File(inputData, 'r') as f:
    # Iterate the file's keys directly; wrapping them in set() only scrambled the order.
    for key in f.keys():
        arr=f[key]['data'][:]
        # Add a leading channel axis before handing the volume to the transform.
        volume=transform(torch.tensor(arr).unsqueeze(0))

        # Min-max normalise in float32 and only then round to float16 for storage.
        images=volume.numpy().astype(np.float32)
        lo,hi=images.min(),images.max()
        # A constant volume would be 0/0; store zeros instead of NaNs.
        images=(images-lo)/(hi-lo) if hi>lo else np.zeros_like(images)

        grp=resizedFile.create_group(key)
        # Record the spatial size of what was actually written.
        grp.attrs['shape']=images.shape[1:]
        grp.create_dataset('data',data=images.astype(np.float16),compression="gzip", compression_opts=6)



In [49]:
key

'597'

In [42]:
arr.shape

(176, 512, 512)

In [44]:
imgSize

array([[  0., 171.],
       [  5., 476.],
       [  2., 485.]])

In [43]:
transform(volume)

RuntimeError: applying transform <monai.transforms.spatial.array.Resize object at 0x000001C0E3295A90>

In [33]:
a=[1,2,3]
np.random.shuffle(a)
a

[2, 1, 3]

In [39]:
### let's do k-fold samples and just store 


def kFold_TrainTestSplit(arr,k:int,testP=.1,shuffle=True):
    """
    Draw k independent random train/test splits of arr.

    Every split samples round(testP*len(arr)) test items without replacement
    through NumPy's global RNG; the train part is the remaining distinct
    values of arr. The splits are drawn independently of one another, so
    test sets of different splits may overlap.

    arr: sequence of hashable ids (needs len()).
    k: number of splits to draw.
    testP: fraction of arr placed in each test part (must be <= 1).
    shuffle: shuffle both parts in place after drawing them.

    returns a list of size k:
    [{"train":[tr_k],"test":[te_k]},...]
    """
    assert len(arr)>1 and testP<=1
    nHeldOut=round(testP*len(arr))

    def drawSplit():
        heldOut=list(np.random.choice(arr,size=nHeldOut,replace=False))
        kept=list(set(arr)-set(heldOut))
        if shuffle:
            # test part first, then train part: keeps the RNG call order fixed
            for part in (heldOut,kept):
                np.random.shuffle(part)
        return {"train":kept,"test":heldOut}

    return [drawSplit() for _ in range(k)]

def gen_kFold_TrainTestSplit(arr,k,trP,fileName="kFoldGen.sav"):
    """
    Draw k random train/test splits of arr and dump them to fileName with joblib.

    arr: sequence of ids to split.
    k: number of splits.
    trP: fraction of arr that goes into each *train* part.
    fileName: path of the joblib file to write.
    """
    # kFold_TrainTestSplit's third parameter is the *test* fraction, so the
    # train fraction has to be converted before it is handed over.
    folds=kFold_TrainTestSplit(arr,k,testP=1-trP)
    joblib.dump(folds,fileName)

def load_kFold_TrainTestSplit(fileName="kFoldGen.sav"):
    """
    Read back a fold list written by gen_kFold_TrainTestSplit.

    returns a list of size k:
    [{"train":[tr_k],"test":[te_k]},...]
    """
    # joblib files are pickle-based: only load files you produced yourself.
    storedFolds=joblib.load(fileName)
    return storedFolds

In [35]:
arr=[int(i.split("_")[-1]) for i in yflags.index]
k=5
trP=.8
# kFold_TrainTestSplit takes the *test* fraction as its third argument, so the
# train fraction is converted here instead of being passed through as-is.
folds=kFold_TrainTestSplit(arr,k,testP=1-trP)


In [36]:
folds[1]['train']

[503,
 359,
 241,
 8,
 358,
 589,
 222,
 460,
 838,
 843,
 510,
 718,
 831,
 255,
 613,
 276,
 320,
 259,
 292,
 11,
 727,
 570,
 475,
 92,
 904,
 51,
 192,
 286,
 190,
 498,
 794,
 451,
 429,
 744,
 24,
 487,
 868,
 733,
 213,
 324,
 64,
 80,
 670,
 906,
 913,
 578,
 84,
 137,
 731,
 644,
 177,
 619,
 146,
 754,
 119,
 171,
 792,
 357,
 550,
 120,
 147,
 604,
 333,
 797,
 561,
 698,
 207,
 894,
 469,
 618,
 138,
 430,
 655,
 634,
 38,
 339,
 149,
 919,
 809,
 784,
 688,
 226,
 194,
 53,
 855,
 737,
 321,
 785,
 679,
 375,
 351,
 159,
 220,
 354,
 87,
 872,
 424,
 649,
 41,
 810,
 743,
 710,
 422,
 114,
 389,
 630,
 291,
 862,
 158,
 385,
 769,
 314,
 876,
 676,
 157,
 645,
 738,
 326,
 828,
 23,
 911,
 827,
 221,
 572,
 135,
 593,
 605,
 100,
 262,
 563,
 842,
 340,
 132,
 25,
 322,
 646,
 341,
 180,
 699,
 250,
 150,
 559,
 562,
 471,
 753,
 270,
 607,
 69,
 365,
 77,
 793,
 590,
 170,
 443,
 142,
 232,
 689,
 878,
 323,
 305,
 757,
 26,
 373,
 345,
 480,
 94,
 780,
 386,
 547,
 621,

In [17]:
[int(i.split("_")[-1]) for i in yflags.index]

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [None]:
### let's do k-fold samples and just store 


def kFold_TrainTestSplit(arr,k:int,testP=.1,shuffle=True):
    """
    Draw k independent random train/test splits of arr.

    Every split samples round(testP*len(arr)) test items without replacement
    through NumPy's global RNG; the train part is the remaining distinct
    values of arr. The splits are drawn independently of one another, so
    test sets of different splits may overlap.

    arr: sequence of hashable ids (needs len()).
    k: number of splits to draw.
    testP: fraction of arr placed in each test part (must be <= 1).
    shuffle: shuffle both parts in place after drawing them.

    returns a list of size k:
    [{"train":[tr_k],"test":[te_k]},...]
    """
    assert len(arr)>1 and testP<=1
    nHeldOut=round(testP*len(arr))

    def drawSplit():
        heldOut=list(np.random.choice(arr,size=nHeldOut,replace=False))
        kept=list(set(arr)-set(heldOut))
        if shuffle:
            # test part first, then train part: keeps the RNG call order fixed
            for part in (heldOut,kept):
                np.random.shuffle(part)
        return {"train":kept,"test":heldOut}

    return [drawSplit() for _ in range(k)]

def gen_kFold_TrainTestSplit(arr,k,trP,fileName="kFoldGen.sav"):
    """
    Draw k random train/test splits of arr and dump them to fileName with joblib.

    arr: sequence of ids to split.
    k: number of splits.
    trP: fraction of arr that goes into each *train* part.
    fileName: path of the joblib file to write.
    """
    # kFold_TrainTestSplit's third parameter is the *test* fraction, so the
    # train fraction has to be converted before it is handed over.
    folds=kFold_TrainTestSplit(arr,k,testP=1-trP)
    joblib.dump(folds,fileName)

def load_kFold_TrainTestSplit(fileName="kFoldGen.sav"):
    """
    Read back a fold list written by gen_kFold_TrainTestSplit.

    returns a list of size k:
    [{"train":[tr_k],"test":[te_k]},...]
    """
    # joblib files are pickle-based: only load files you produced yourself.
    storedFolds=joblib.load(fileName)
    return storedFolds

In [129]:
# --- hardware ---
cuda=torch.cuda.is_available()
useGPU=True

# --- batching ---
pretrain_batchsize = 5
batchSize=25
channels=3

# --- on-the-fly data augmentation ---
aug_prob = 1                        # probability handed to each random transform
rand_rot = 10                       # random rotation range [deg]
rand_rot_rad = rand_rot*math.pi/180 # same range in [rad]
rand_noise_std = 0.01               # std of the random Gaussian noise
rand_shift = 5                      # shift in px
min_zoom, max_zoom = 0.9, 1.1       # zoom factor bounds




def kFold_TrainTestSplit(arr,k:int,testP=.1,shuffle=True):
    """
    Draw k independent random train/test splits of arr.

    Every split samples round(testP*len(arr)) test items without replacement
    through NumPy's global RNG; the train part is the remaining distinct
    values of arr. The splits are drawn independently of one another, so
    test sets of different splits may overlap.

    arr: sequence of hashable ids (needs len()).
    k: number of splits to draw.
    testP: fraction of arr placed in each test part (must be <= 1).
    shuffle: shuffle both parts in place after drawing them.

    returns a list of size k:
    [{"train":[tr_k],"test":[te_k]},...]
    """
    assert len(arr)>1 and testP<=1
    nHeldOut=round(testP*len(arr))

    def drawSplit():
        heldOut=list(np.random.choice(arr,size=nHeldOut,replace=False))
        kept=list(set(arr)-set(heldOut))
        if shuffle:
            # test part first, then train part: keeps the RNG call order fixed
            for part in (heldOut,kept):
                np.random.shuffle(part)
        return {"train":kept,"test":heldOut}

    return [drawSplit() for _ in range(k)]

def gen_kFold_TrainTestSplit(arr,k,trP,fileName="kFoldGen.sav"):
    """
    Draw k random train/test splits of arr and dump them to fileName with joblib.

    arr: sequence of ids to split.
    k: number of splits.
    trP: fraction of arr that goes into each *train* part.
    fileName: path of the joblib file to write.
    """
    # kFold_TrainTestSplit's third parameter is the *test* fraction, so the
    # train fraction has to be converted before it is handed over.
    folds=kFold_TrainTestSplit(arr,k,testP=1-trP)
    joblib.dump(folds,fileName)

def load_kFold_TrainTestSplit(fileName="kFoldGen.sav"):
    """
    Read back a fold list written by gen_kFold_TrainTestSplit.

    returns a list of size k:
    [{"train":[tr_k],"test":[te_k]},...]
    """
    # joblib files are pickle-based: only load files you produced yourself.
    storedFolds=joblib.load(fileName)
    return storedFolds



"""
transforms_dic = {
    'train': Compose([
        RandRotate(range_x=rand_rot_rad, 
                    range_y=rand_rot_rad, 
                    range_z=rand_rot_rad, 
                    prob=aug_prob),
        RandGaussianNoise(std=rand_noise_std, prob=aug_prob),
        Affine(translate_params=(rand_shift,
                                    rand_shift,
                                    rand_shift), 
                image_only=True),
        RandZoom(min_zoom=min_zoom, max_zoom=max_zoom, prob=aug_prob),
        RepeatChannel(repeats=channels),
    ]),
    'train_noaug': None,
    'project_noaug':None,
    'val': None,
    'test': None,
    'test_projection': None,
}"""


# Augmenting pipeline for the training splits.
# NOTE(review): Affine is given one fixed translation, although rand_shift is
# described as a random shift -- confirm whether a random variant was intended.
transforms_dic = {
    'train': Compose([
        RandRotate(
            range_x=rand_rot_rad,
            range_y=rand_rot_rad,
            range_z=rand_rot_rad,
            prob=aug_prob,
        ),
        RandGaussianNoise(std=rand_noise_std, prob=aug_prob),
        Affine(translate_params=(rand_shift,)*3, image_only=True),
        RandZoom(min_zoom=min_zoom, max_zoom=max_zoom, prob=aug_prob),
        RepeatChannel(repeats=channels),
    ]),
}

# Every other split only repeats the channel axis; each key gets its own Compose instance.
transforms_dic.update({
    splitName: Compose([RepeatChannel(repeats=channels)])
    for splitName in ('train_noaug', 'project_noaug', 'val', 'test', 'test_projection')
})



class AugSupervisedDataset(torch.utils.data.Dataset):
    """Supervised dataset yielding ``(volume, label)`` pairs read from an HDF5 file.

    Every patient is one HDF5 group named by its patient key and holding a
    ``data`` dataset. The label is looked up in ``yflag_dict`` under
    ``"Breast_MRI_<key zero-padded to 3 characters>"``.
    """

    def __init__(self,
                 dataset_h5path,
                 yflag_dict,
                 subsetKeys=None,
                 transform = None,
                 ):
        """
        dataset_h5path: path of the HDF5 file; it is re-opened on every item access.
        yflag_dict: mapping "Breast_MRI_XXX" -> label.
        subsetKeys: patient keys to serve. None or empty means every group in
            the file, sorted as strings.
        transform: optional callable applied to the volume tensor.
        """
        self.dataset_path=dataset_h5path
        self.yflag_dict=yflag_dict

        # Explicit None/length test: plain truthiness raises for numpy arrays.
        if subsetKeys is not None and len(subsetKeys)>0:
            self.subsetKeys=subsetKeys
        else:
            with h5py.File(self.dataset_path, 'r') as f:
                self.subsetKeys=sorted(f.keys())

        self.transform=transform

    def __len__(self):
        """Number of patients served; __init__ always stores a sized key collection."""
        return len(self.subsetKeys)

    def __getitem__(self,idx):
        """Return ``(volume, label)`` for the idx-th key.

        With a transform set, the transformed volume is min-max scaled to [0, 1];
        without one the stored values are returned unchanged.
        """
        patientKey=self.subsetKeys[idx]
        label=self.yflag_dict[f"Breast_MRI_{str(patientKey).zfill(3)}"]
        with h5py.File(self.dataset_path, 'r') as f:
            image=f[str(patientKey)]['data'][:]

        # presumably (1, D, H, W) as written by the resize cells -- TODO confirm
        volume = torch.tensor(image)

        if self.transform:
            volume = self.transform(volume)
            img_min = volume.min()
            img_max = volume.max()
            span = img_max-img_min
            if span > 0:
                volume = (volume-img_min)/span
            else:
                # Constant volume: avoid 0/0 -> NaN and return all zeros instead.
                volume = volume-img_min

        return volume, label

    
        

class TwoAugSelfSupervisedDataset(torch.utils.data.Dataset):
    """Dataset yielding two transformed views of one volume plus its label.

    Every patient is one HDF5 group named by its patient key and holding a
    ``data`` dataset. The label is looked up in ``yflag_dict`` under
    ``"Breast_MRI_<key zero-padded to 3 characters>"``.
    """

    def __init__(self,
                 dataset_h5path,
                 yflag_dict,
                 subsetKeys=None,
                 transform = None,
                 ):
        """
        dataset_h5path: path of the HDF5 file; it is re-opened on every item access.
        yflag_dict: mapping "Breast_MRI_XXX" -> label.
        subsetKeys: patient keys to serve. None or empty means every group in
            the file, sorted as strings.
        transform: optional callable applied twice to the volume tensor.
        """
        self.dataset_path=dataset_h5path
        self.yflag_dict=yflag_dict

        # Explicit None/length test: plain truthiness raises for numpy arrays.
        if subsetKeys is not None and len(subsetKeys)>0:
            self.subsetKeys=subsetKeys
        else:
            with h5py.File(self.dataset_path, 'r') as f:
                self.subsetKeys=sorted(f.keys())

        self.transform=transform

    def __len__(self):
        """Number of patients served; __init__ always stores a sized key collection."""
        return len(self.subsetKeys)

    def __getitem__(self,idx):
        """Return ``(view_1, view_2, label)`` for the idx-th key.

        With a transform set, it is applied twice and each result is min-max
        scaled to [0, 1]. Without one, both views are the same untouched tensor.
        """
        patientKey=self.subsetKeys[idx]
        label=self.yflag_dict[f"Breast_MRI_{str(patientKey).zfill(3)}"]
        with h5py.File(self.dataset_path, 'r') as f:
            image=f[str(patientKey)]['data'][:]

        # presumably (1, D, H, W) as written by the resize cells -- TODO confirm
        volume = torch.tensor(image)

        if not self.transform:
            return volume, volume, label

        views=[]
        for _ in range(2):
            newVolume = self.transform(volume)
            img_min = newVolume.min()
            img_max = newVolume.max()
            span = img_max-img_min
            if span > 0:
                newVolume = (newVolume-img_min)/span
            else:
                # Constant volume: avoid 0/0 -> NaN and return all zeros instead.
                newVolume = newVolume-img_min
            views.append(newVolume)

        return views[0], views[1], label


        

def construct_data(
        dataset_h5path,
        yflag_df,
        yLabelColumn='StagingNodes',
        k_fold = 5,
        test_p=.2,
        val_p=.05,
        seed=42,
    ):
    """
    Split the labelled patients into train/val/test and wrap each split in datasets.

    Patients are kept when their yLabelColumn value is not NaN and a group named
    after their number exists in the h5 file. NumPy's global RNG is seeded with
    `seed`; of the k_fold random splits drawn at each level only the first is used.
    Labels are binarised as `value >= 1`.

    returns
    trainset, trainset_pretraining, trainset_normal, trainset_normal_augment, projectset, valset, testset, testset_projection
    """
    labelColumn=yflag_df[yLabelColumn]

    # patient numbers follow the Breast_MRI_"i".zfill(3) convention
    numericIds=[int(rowName.split("_")[-1]) for rowName in yflag_df.index]
    labelled=[]
    for num in numericIds:
        if np.isnan(labelColumn[f"Breast_MRI_{str(num).zfill(3)}"]):
            continue
        labelled.append(num)
    with h5py.File(dataset_h5path, 'r') as f:
        validPatients=[num for num in labelled if str(num) in f.keys()]

    np.random.seed(seed)

    # Only the first of the k_fold splits is used at each level.
    outer=kFold_TrainTestSplit(validPatients,k=k_fold,testP=test_p)[0]
    # val_p is a fraction of all patients, so rescale it to the train remainder.
    inner=kFold_TrainTestSplit(outer['train'],k=k_fold,testP=val_p/(1-test_p))[0]
    folds={'train':inner['train'],'test':outer['test'],'val':inner['test']}

    # modify this expression if you want to change how the classification target is defined
    yflag_dict={ind:labelColumn[ind]>=1 for ind in yflag_df.index}

    def twoView(split, transformKey):
        return TwoAugSelfSupervisedDataset(
            dataset_h5path,
            yflag_dict,
            subsetKeys=folds[split],
            transform=transforms_dic[transformKey],
        )

    def supervised(split, transformKey):
        return AugSupervisedDataset(
            dataset_h5path,
            yflag_dict,
            subsetKeys=folds[split],
            transform=transforms_dic[transformKey],
        )

    return (
        twoView('train', 'train'),              # trainset
        twoView('train', 'train'),              # trainset_pretraining
        supervised('train', 'train_noaug'),     # trainset_normal
        supervised('train', 'train'),           # trainset_normal_augment
        supervised('train', 'project_noaug'),   # projectset
        supervised('val', 'val'),               # valset
        supervised('test', 'test'),             # testset
        supervised('test', 'test_projection'),  # testset_projection
    )



def _seed_worker(worker_id, base_seed=0):
    """Seed NumPy's global RNG inside one DataLoader worker process.

    DataLoader calls ``worker_init_fn(worker_id)`` in each worker; offsetting
    the seed by ``worker_id`` gives every worker its own reproducible stream.
    """
    np.random.seed((base_seed + worker_id) % 2**32)


def get_dataloaders(dataset_h5path,
        yflag_df,
        yLabelColumn='StagingNodes',
        k_fold = 5,
        test_p=.2,
        val_p=.05,
        batchSize=25,
        num_workers=1,
        seed=42,):
    """
    calls construct_data and wraps every returned dataset in a DataLoader

    batchSize is used for all four training loaders (including the pretraining
    one); the projection, validation and test loaders use batch size 1.
    Memory pinning is enabled when the module-level useGPU flag is set and
    CUDA is available.

    returns
    trainloader, trainloader_pretraining, trainloader_normal, trainloader_normal_augment, projectloader, valloader, testloader, test_projectloader
    """
    from functools import partial

    (trainset, trainset_pretraining, trainset_normal, trainset_normal_augment,
     projectset, valset, testset, testset_projection) = construct_data(
        dataset_h5path,
        yflag_df,
        yLabelColumn=yLabelColumn,
        k_fold = k_fold,
        test_p=test_p,
        val_p=val_p,
        seed=seed,)

    # The previous code wrote `worker_init_fn = np.random.seed(seed)`, which
    # re-seeded the main process and handed None to the loaders. Keep the
    # main-process re-seed callers may rely on, but give workers a real callable.
    np.random.seed(seed)
    initWorker = partial(_seed_worker, base_seed=seed)

    usePins = useGPU and torch.cuda.is_available()

    def makeLoader(dataset, batch_size, shuffle, drop_last):
        return torch.utils.data.DataLoader(
            dataset = dataset,
            batch_size = batch_size,
            shuffle = shuffle,
            pin_memory = usePins,
            num_workers = num_workers,
            worker_init_fn = initWorker,
            drop_last = drop_last)

    trainloader = makeLoader(trainset, batchSize, shuffle=True, drop_last=True)
    trainloader_pretraining = makeLoader(trainset_pretraining, batchSize, shuffle=True, drop_last=True)
    trainloader_normal = makeLoader(trainset_normal, batchSize, shuffle=False, drop_last=True)
    trainloader_normal_augment = makeLoader(trainset_normal_augment, batchSize, shuffle=True, drop_last=True)
    projectloader = makeLoader(projectset, 1, shuffle=False, drop_last=True)
    valloader = makeLoader(valset, 1, shuffle=True, drop_last=False)
    testloader = makeLoader(testset, 1, shuffle=False, drop_last=False)
    test_projectloader = makeLoader(testset_projection, 1, shuffle=False, drop_last=False)

    return trainloader, trainloader_pretraining, trainloader_normal, trainloader_normal_augment, projectloader, valloader, testloader, test_projectloader



In [136]:
yflags=pd.read_csv("../duke/ClinicalFlags.csv",index_col=0)


dataloaders=get_dataloaders(dataset_h5path='data/firstpass_923_avgCropResize.h5',
                            yflag_df=yflags,
                            yLabelColumn='StagingNodes',
                            k_fold=5,
                            test_p=.2,
                            val_p=.05,
                            batchSize=25,)

In [137]:
trainset, trainset_pretraining, trainset_normal, trainset_normal_augment, projectset, valset, testset, testset_projection = construct_data(
                            dataset_h5path='data/firstpass_923_avgCropResize.h5',
                            yflag_df=yflags,
                            yLabelColumn='StagingNodes',
                            k_fold=5,
                            test_p=.2,
                            val_p=.05,
                            )



In [117]:
trainset_normal_augment.transform()

RuntimeError: applying transform <monai.transforms.spatial.array.RandRotate object at 0x000001C0E3464A10>

In [138]:
dat=trainset[0][0][0] #([tensors in batch],[labels in batch]) ### according to __getitem__

In [127]:
id=trainset.subsetKeys[2]
label = trainset.yflag_dict[f"Breast_MRI_{str(id).zfill(3)}"]
with h5py.File(trainset.dataset_path, 'r') as f:
    image=f[str(id)]['data'][:]
volume = torch.tensor(image) # torch.Size([160, 229, 193])
#volume = torch.unsqueeze(volume, 0) # add channel dimension
if trainset.transform:
    volume = trainset.transform(volume)
    img_min = volume.min()
    img_max = volume.max()
    volume = (volume-img_min)/(img_max-img_min)

In [120]:
morph=trainset.transform(dat)

metatensor(0.7750)

In [68]:
trainDataLoader=dataloaders[0]

In [70]:
trainDataLoader

TypeError: 'DataLoader' object is not subscriptable

In [6]:
volume=torch.tensor(patientData[0]).unsqueeze(0)

In [7]:
testTransform=transforms_dic['train'](volume)

In [8]:
volume.shape,testTransform.shape

(torch.Size([1, 142, 512, 512]), torch.Size([1, 162, 488, 488]))

In [62]:
yflags['StagingNodes']["Breast_MRI_001"]

1.0