In [33]:
import pandas as pd
import numpy as np
from typing import Sequence
from pathlib import Path
import os
import functools 
from shutil import copyfile
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [34]:
PATH = Path('/home/chens/practicum/')

## 1. filter records

In [35]:
def filter_records(df,treatment_range=None,date_range=None,primary_use_list=None,age_range=None,met_range=None,return_list = False,max_volume = None,min_volume = None) -> Sequence[str]:
    
    """
    
    filter the medical records based on 
        1. the number of treatments
        2. first treatment date
        3. the primary use, age and number of mets on the first treatment
    
    return a list of filenames if the parameter 'return_list' is True, otherwise return a DataFrame
    
    """

    # Treatment count
    droped_duplicates_date = df[['PiCare PatientID','StudyDateAnon']].drop_duplicates().dropna()
    treat_count = droped_duplicates_date.groupby(['PiCare PatientID']).agg({'StudyDateAnon':'count'}).reset_index().rename(columns={'StudyDateAnon':'Treatment Count'})
    print(treat_count[['PiCare PatientID']].drop_duplicates().shape)
    filtered_treatment_num = treat_count[treat_count['Treatment Count'].between(treatment_range[0],treatment_range[1])]
    filtered_df = filtered_treatment_num.merge(df,on='PiCare PatientID',how='left')
    print(filtered_df[['PiCare PatientID']].drop_duplicates().shape)
    
    # First Treatment Date
    first_treatment_date = filtered_df.groupby(['PiCare PatientID']).agg({'StudyDateAnon':'min'}).reset_index()
    filtered_date = first_treatment_date[first_treatment_date['StudyDateAnon'].between(date_range[0],date_range[1])]
    filtered_df = filtered_date.merge(filtered_df,on=['PiCare PatientID','StudyDateAnon'],how='left')[['PiCare PatientID',
                                                                                          'StudyDateAnon',
                                                                                          'Primary tumor Site',
                                                                                          'Age primary diag','Target volume']].drop_duplicates()
    # Max sizes
    max_target_volume = filtered_df.groupby(['PiCare PatientID','StudyDateAnon']).agg({'Target volume':'max'}).reset_index()
    filtered_df = filtered_df.merge(max_target_volume[(max_target_volume['Target volume']>=min_volume)&(max_target_volume['Target volume']<max_volume)][['PiCare PatientID']],on = 'PiCare PatientID',how = 'right')
    print(filtered_df[['PiCare PatientID']].drop_duplicates().shape)
    # Primary Use on First Treatment
    filtered_df = filtered_df[filtered_df['Primary tumor Site'].isin(primary_use_list)]
    print(filtered_df[['PiCare PatientID']].drop_duplicates().shape)
    # Age on First Treatment 
    filtered_df = filtered_df[filtered_df['Age primary diag'].between(age_range[0],age_range[1])]
    print(filtered_df[['PiCare PatientID']].drop_duplicates().shape)
    # Number of mets on First Treatment
    num_of_mets = filtered_df.merge(df,on=['PiCare PatientID','StudyDateAnon','Primary tumor Site','Age primary diag'],how='left').assign(const=1).groupby(['PiCare PatientID','StudyDateAnon']).agg({'const':'count'}).rename(columns = {'const':'num of mets'}).reset_index()
    filtered_mets = num_of_mets[num_of_mets['num of mets'].between(met_range[0],met_range[1])]
    filtered_df = filtered_mets.merge(filtered_df,on = ['PiCare PatientID','StudyDateAnon'],how='left').dropna()
    print(filtered_df[['PiCare PatientID']].drop_duplicates().shape)
    if return_list:
        return list(set([
                str(filtered_df.iloc[i,0])
                +'_'
                +''.join(str(filtered_df.iloc[i,1]).split(' ')[0].split('-'))
                for i in range(len(filtered_df))
        ]))
    else: 
        return filtered_df.rename(columns={'StudyDateAnon':'First Study Date'})


### Example

In [36]:
df = pd.read_excel('/data/public/MIM_BMETS_V6/3_final_datasets/manuscript_1_datasets/Master_BrainMets_List_Anon_June.xlsx')

In [37]:
df['StudyDateAnon'] = df['StudyDateAnon'].replace(to_replace=None, method='ffill')

In [71]:
selected_df = filter_records(
    df,
    treatment_range = [1,100000],
    date_range = ['1700-01-01','2020-03-01'],
    primary_use_list = list(df['Primary tumor Site'].unique()),
    met_range = [1,1000000],
    age_range=[1,100],
    return_list = False,
    min_volume = 0.,
    max_volume = 1000000
    )

(1866, 1)
(1866, 1)
(1790, 1)
(1790, 1)
(1730, 1)
(1730, 1)


In [73]:
selected_df['num of mets']==2

0       False
1       False
2       False
3       False
4       False
        ...  
6094    False
6095    False
6096    False
6097    False
6098    False
Name: num of mets, Length: 6080, dtype: bool

## Check Existence

In [39]:
os.listdir(file_path)

['1met_dont_include.xlsx',
 '3mets-500',
 '11+mets-500',
 'first_tx_allmets_0-0.5cc',
 'first_tx_4-5mets',
 'first_tx_2mets',
 '6-10mets_dont_include.xlsx',
 'first_tx_3mets',
 'first_tx_6-10mets',
 '6-10mets-500',
 '4-5mets_dont_include.xlsx',
 'Master_BrainMets_List_Anon_June.xlsx',
 '4-5mets-500',
 '3mets_dont_include.xlsx',
 'first_tx_allmets',
 'first_tx_1met',
 'first_tx_11+mets',
 '11+mets_dont_include.xlsx',
 '1met-500',
 '2mets_dont_include.xlsx',
 '2mets-500']

In [38]:
file_path = Path('/data/public/MIM_BMETS_V6/3_final_datasets/manuscript_1_datasets/')

In [7]:
# def filter_image_mask_existence(file_path,name_list_from_excel):
    
#     """
    
#     This function is for checking existence of real images based on the Patient_ID list created by 
#     filter_records function above
    
#     """
    
#     namelists = [path[2] for path in list(os.walk(file_path))[1:]]
#     namesets = []
#     for namelist in namelists:
#         namesets.append(set(['_'.join(name.split('_')[:2]) for name in namelist]))
    
#     #brain_masks_set, mets_masks, images, skll_stripped = namesets
#     existing_name_list = functools.reduce(lambda x,y:x.intersection(y),namesets)
#     result = [name for name in name_list_from_excel if name in existing_name_list]
#     return result

In [65]:
# def filter_image_mask_existence_for_certain_volume(file_path,name_list_from_excel):
    
#     """
    
#     This function is for checking existence of real images based on the Patient_ID list created by 
#     filter_records function above
    
#     """
#     sub_folders= ['first_tx_1met','first_tx_2mets','first_tx_3mets','first_tx_4-5mets','first_tx_6-10mets','first_tx_11+mets']
#     namelists = []
#     for sub_folder in sub_folders:
#         namelists += [path[2] for path in list(os.walk(file_path/sub_folder))[1:] if len(path[2])>0]
#     namelist = functools.reduce(lambda x,y:x+y,namelists)
#     nameset = set(namelist)
#     print(len(nameset))
#     #brain_masks_set, mets_masks, images, skll_stripped = namesets
#     existing_name_list = list(nameset)
#     result = [name for name in name_list_from_excel if name+'.npy' in existing_name_list]
#     return result

### Example

In [49]:
# file_path = Path('/data/public/MIM_BMETS_V6/3_final_datasets/manuscript_1_datasets')
# # sub_sets = ['training','validation','testing']
# sub_sets = ['training']
# total_num=0
# existing_file = []
# for sub_folder in ['first_tx_allmets']:
#     for sub_set in sub_sets:
#         total_num+=len(os.listdir(file_path/sub_folder/sub_set/'skull_stripped_1x1x3'))
#         existing_file+=[str(file_path/sub_folder/sub_set/'skull_stripped_1x1x3')+'/'+name for name in os.listdir(file_path/sub_folder/sub_set/'skull_stripped_1x1x3')]
#         print(len(os.listdir(file_path/sub_folder/sub_set/'skull_stripped_1x1x3')))

477


In [50]:
sizes = []
existing_file_list_size = []

In [51]:
for size, (min_volume,max_volume) in enumerate([(0.,0.499),(0.5,0.999),(1.,2.999),(3.,4.999),(5.,9.999),(10.,10000.)]):
    selected_df = filter_records(
        df,
        treatment_range = [1,100000],
        date_range = ['1700-01-01','2020-03-01'],
        primary_use_list = list(df['Primary tumor Site'].unique()),
        met_range = [1,1000000],
        age_range=[1,100],
        return_list = True,
        min_volume = min_volume,
        max_volume = max_volume
        )
    file_path = Path('/data/public/MIM_BMETS_V6/3_final_datasets/manuscript_1_datasets')
    sub_sets = ['training']
    total_num=0
    existing_file = []
    for sub_folder in ['first_tx_allmets']:
        for sub_set in sub_sets:
            total_num+=len(os.listdir(file_path/sub_folder/sub_set/'skull_stripped_1x1x3'))
            existing_file+=[str(file_path/sub_folder/sub_set/'skull_stripped_1x1x3')+'/'+name for name in os.listdir(file_path/sub_folder/sub_set/'skull_stripped_1x1x3')]
            print(len(os.listdir(file_path/sub_folder/sub_set/'skull_stripped_1x1x3')))
    existing_file_list_size += [i for i in existing_file if i.split('/')[-1] in [j+'.npy' for j in selected_df]]
    sizes += [size for i in range(len([i for i in existing_file if i.split('/')[-1] in [j+'.npy' for j in selected_df]]))]

(1866, 1)
(1866, 1)
(310, 1)
(310, 1)
(294, 1)
(294, 1)
477
(1866, 1)
(1866, 1)
(187, 1)
(187, 1)
(180, 1)
(180, 1)
477
(1866, 1)
(1866, 1)
(424, 1)
(424, 1)
(413, 1)
(413, 1)
477
(1866, 1)
(1866, 1)
(293, 1)
(293, 1)
(285, 1)
(285, 1)
477
(1866, 1)
(1866, 1)
(354, 1)
(354, 1)
(340, 1)
(340, 1)
477
(1866, 1)
(1866, 1)
(222, 1)
(222, 1)
(218, 1)
(218, 1)
477


In [54]:
len(existing_file_list_size)

460

In [55]:
len(sizes)

460

In [56]:
np.unique(sizes,return_counts=True)

(array([0, 1, 2, 3, 4, 5]), array([ 45,  46, 116,  89, 103,  61]))

In [65]:
nums = []
existing_file_list_num = []

In [66]:
for num, (min_num,max_num) in enumerate([(0.5,1.5),(1.5,2.5),(2.5,3.5),(3.5,5.5),(5.5,10.5),(10.5,10000)]):
    selected_df = filter_records(
        df,
        treatment_range = [1,100000],
        date_range = ['1700-01-01','2020-03-01'],
        primary_use_list = list(df['Primary tumor Site'].unique()),
        met_range = [min_num,max_num],
        age_range=[1,100],
        return_list = True,
        min_volume = -1,
        max_volume = 1000000
        )
    file_path = Path('/data/public/MIM_BMETS_V6/3_final_datasets/manuscript_1_datasets')
    sub_sets = ['training']
    total_num=0
    existing_file = []
    for sub_folder in ['first_tx_allmets']:
        for sub_set in sub_sets:
            total_num+=len(os.listdir(file_path/sub_folder/sub_set/'skull_stripped_1x1x3'))
            existing_file+=[str(file_path/sub_folder/sub_set/'skull_stripped_1x1x3')+'/'+name for name in os.listdir(file_path/sub_folder/sub_set/'skull_stripped_1x1x3')]
            print(len(os.listdir(file_path/sub_folder/sub_set/'skull_stripped_1x1x3')))
    existing_file_list += [i for i in existing_file if i.split('/')[-1] in [j+'.npy' for j in selected_df]]
    nums += [num for i in range(len([i for i in existing_file if i.split('/')[-1] in [j+'.npy' for j in selected_df]]))]

(1866, 1)
(1866, 1)
(1790, 1)
(1790, 1)
(1730, 1)
(497, 1)
477
(1866, 1)
(1866, 1)
(1790, 1)
(1790, 1)
(1730, 1)
(2, 1)
477
(1866, 1)
(1866, 1)
(1790, 1)
(1790, 1)
(1730, 1)
(0, 1)
477
(1866, 1)
(1866, 1)
(1790, 1)
(1790, 1)
(1730, 1)
(344, 1)
477
(1866, 1)
(1866, 1)
(1790, 1)
(1790, 1)
(1730, 1)
(244, 1)
477
(1866, 1)
(1866, 1)
(1790, 1)
(1790, 1)
(1730, 1)
(643, 1)
477


In [67]:
np.unique(nums,return_counts=True)

(array([0, 3, 4, 5]), array([118,  97,  57, 188]))

In [56]:
# df[df['PiCare PatientID'] == 'BrainMets-UCSF-00147']

In [57]:
# existing_name_list = filter_image_mask_existence_for_certain_volume(file_path,selected_df)

In [58]:
# len(existing_name_list)

In [59]:
# existing_name_list2 = filter_image_mask_existence_for_certain_volume(file_path,selected_df2)

In [60]:
# len(existing_name_list2)

## Create Subsets

In [141]:
def creating_folders(target_folder):
    """
    
    Spliting the data filtered above into train, validation and test sets.
    Creating a folder to store this subset.
    
    """
    main_path = Path('/home/chens/practicum/MEDomicsLab-develop-brainmets/image_processing/manuscript_1_datasets')
    target_folder = main_path/target_folder
    target_folder.mkdir()
    for name in ['training','validation','testing']:
        (target_folder/name).mkdir()
        for filetype in ['brain_masks_1x1x3','images_1x1x3','mets_masks_1x1x3','skull_stripped_1x1x3']:
            (target_folder/name/filetype).mkdir()

In [142]:
def data_spliting(filelist,train_size,validation_size):
    train,validation_test = train_test_split(filelist,train_size=train_size,shuffle=True)
    validation,test = train_test_split(validation_test,train_size=validation_size/(1-train_size),shuffle=True)
#     print(validation_size/(1-train_size))
    return train,validation,test

In [143]:
def saving_subsets(target_folder,train,validation,test):
    original_path = Path('/data/public/MIM_BMETS_V6/3_final_datasets/manuscript_1_datasets/first_tx_allmets')
    main_path = Path('/home/chens/practicum/MEDomicsLab-develop-brainmets/image_processing/manuscript_1_datasets')
    target_folder = main_path/target_folder
#     for name in ['training','validation','testing']:
    for filetype in ['brain_masks_1x1x3','images_1x1x3','mets_masks_1x1x3','skull_stripped_1x1x3']:
        for file in tqdm(existing_file_list):
            if file in train:
                copyfile(file.replace('skull_stripped_1x1x3',filetype),target_folder/'training'/filetype/file.split('/')[-1])
            elif file in validation:
                copyfile(file.replace('skull_stripped_1x1x3',filetype),target_folder/'validation'/filetype/file.split('/')[-1])
            elif file in test:
                copyfile(file.replace('skull_stripped_1x1x3',filetype),target_folder/'testing'/filetype/file.split('/')[-1])

### Example

In [144]:
target_folder='first_tx_allmets_0-0.5cc'

In [145]:
creating_folders(target_folder)

In [146]:
train,validation,test = data_spliting(existing_file_list,0.7,0.15)

In [147]:
print(len(existing_file_list))
print(len(train))
print(len(validation))
print(len(test))

73
51
10
12


In [148]:
saving_subsets(target_folder,train,validation,test)

100%|██████████| 73/73 [00:02<00:00, 26.97it/s]
100%|██████████| 73/73 [00:02<00:00, 27.80it/s]
100%|██████████| 73/73 [00:02<00:00, 28.99it/s]
100%|██████████| 73/73 [00:02<00:00, 28.91it/s]


In [53]:
80+65+186+117+144+81

448