In [1]:
import pandas as pd
import numpy as np
from typing import Sequence
from pathlib import Path
import os
import functools 
from shutil import copyfile
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
PATH = Path('/home/chens/practicum/')

## 1. filter records

In [73]:
def filter_records(df,treatment_range=None,date_range=None,primary_use_list=None,age_range=None,met_range=None,return_list = False) -> Sequence[str]:
    
    """
    
    filter the medical records based on 
        1. the number of treatments
        2. first treatment date
        3. the primary use, age and number of mets on the first treatment
    
    return a list of filenames if the parameter 'return_list' is True, otherwise return a DataFrame
    
    """

    # Treatment count
    droped_duplicates_date = df[['PiCare PatientID','StudyDateAnon']].drop_duplicates().dropna()
    treat_count = droped_duplicates_date.groupby(['PiCare PatientID']).agg({'StudyDateAnon':'count'}).reset_index().rename(columns={'StudyDateAnon':'Treatment Count'})
    filtered_treatment_num = treat_count[treat_count['Treatment Count'].between(treatment_range[0],treatment_range[1])]
    filtered_df = filtered_treatment_num.merge(df,on='PiCare PatientID',how='left')

    # First Treatment Date
    first_treatment_date = filtered_df.groupby(['PiCare PatientID']).agg({'StudyDateAnon':'min'}).reset_index()
    filtered_date = first_treatment_date[first_treatment_date['StudyDateAnon'].between(date_range[0],date_range[1])]
    filtered_df = filtered_date.merge(filtered_df,on=['PiCare PatientID','StudyDateAnon'],how='left')[['PiCare PatientID',
                                                                                          'StudyDateAnon',
                                                                                          'Primary tumor Site',
                                                                                          'Age primary diag']].drop_duplicates().dropna()
    # Primary Use on First Treatment
    if primary_use_list[0] != 'all': filtered_df = filtered_df[filtered_df['Primary tumor Site'].isin(primary_use_list)]
    
    # Age on First Treatment 
    filtered_df = filtered_df[filtered_df['Age primary diag'].between(age_range[0],age_range[1])]
    
    # Number of mets on First Treatment
    num_of_mets = filtered_df.merge(df,on=['PiCare PatientID','StudyDateAnon','Primary tumor Site','Age primary diag'],how='left').assign(const=1).groupby(['PiCare PatientID','StudyDateAnon']).agg({'const':'count'}).rename(columns = {'const':'num of mets'}).reset_index()
    filtered_mets = num_of_mets[num_of_mets['num of mets'].between(met_range[0],met_range[1])]
    filtered_df = filtered_mets.merge(filtered_df,on = ['PiCare PatientID','StudyDateAnon'],how='left').dropna()
    if return_list:
        return [
                str(filtered_df.iloc[i,0])
                +'_'
                +''.join(str(filtered_df.iloc[i,1]).split(' ')[0].split('-'))
                +'.npy'
                for i in range(len(filtered_df))
        ]
    else: 
        return filtered_df[['PiCare PatientID','StudyDateAnon']].rename(columns={'StudyDateAnon':'First Study Date'})


### Example

In [74]:
!ls /home/chens/practicum/data/

Master_BrainMets_List_Anon_Sihan.xlsx


In [75]:
df = pd.read_excel('/home/nanot/data/Master_BrainMets_List_Anon_June.xlsx')

In [137]:
selected_df1 = filter_records(
    df,
    treatment_range = [1,1],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )
selected_df2 = filter_records(
    df,
    treatment_range = [2,100],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )
selected_df3 = filter_records(
    df,
    treatment_range = [3,100],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )
selected_df4 = filter_records(
    df,
    treatment_range = [4,100],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )
selected_df5 = filter_records(
    df,
    treatment_range = [5,100],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )
selected_df6 = filter_records(
    df,
    treatment_range = [6,100],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )
selected_df7 = filter_records(
    df,
    treatment_range = [7,100],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )
selected_df8 = filter_records(
    df,
    treatment_range = [8,100],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )
selected_df9 = filter_records(
    df,
    treatment_range = [9,100],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )
selected_df10 = filter_records(
    df,
    treatment_range = [10,100],
    date_range = ['1900-01-01','2020-01-01'],
    primary_use_list = ['all'],
    met_range = [1,100],
    age_range=[1,200],
    return_list = True
    )


In [138]:
selected_df_all = selected_df1#selected_df2+selected_df3+selected_df4+selected_df5+selected_df6+selected_df7+selected_df8+selected_df9+selected_df10
len(selected_df_all)

1006

In [139]:
t = set(selected_df_all + (list(t1)))
len(t)

1321

## Check Existence

In [144]:
file_path = Path('/data/public/MIM_BMETS_V6/2_processed/')

In [145]:
def filter_image_mask_existence(file_path,name_list_from_excel):
    
    """
    
    This function is for checking existence of real images based on the Patient_ID list created by 
    filter_records function above
    
    """
    
    namelists = [path[2] for path in list(os.walk(file_path))[1:]]
    namesets = []
    for namelist in namelists:
        namesets.append(set(['_'.join(name.split('_')[:2]) for name in namelist]))
    
    print(len(namesets[3]))
    
    #brain_masks_set, mets_masks, images, skll_stripped = namesets
    existing_name_list = functools.reduce(lambda x,y:x.intersection(y),namesets)
    
    print(len(existing_name_list))
    
    result = [name for name in name_list_from_excel if name in existing_name_list]
    return result

### Example

In [146]:
existing_name_list = filter_image_mask_existence(file_path,selected_df_all)

2133
2133


In [147]:
len(existing_name_list)

993

In [148]:
len(set( existing_name_list))
existing_name_list = list(set(existing_name_list))

## Create Subsets

In [149]:
def creating_folders(target_folder):
    """
    
    Spliting the data filtered above into train, validation and test sets.
    Creating a folder to store this subset.
    
    """
    main_path = Path('/data/public/MIM_BMETS_V6/3_final_datasets/')
    target_folder = main_path/target_folder
    target_folder.mkdir()
    for name in ['training','validation','testing']:
        (target_folder/name).mkdir()
        for filetype in ['brain_masks','images','mets_masks','skull_stripped']:
            (target_folder/name/filetype).mkdir()

In [150]:
def data_spliting(namelist,train_size,validation_size):
    train,validation_test = train_test_split(existing_name_list,train_size=train_size,shuffle=True)
    validation,test = train_test_split(validation_test,train_size=validation_size/(1-train_size),shuffle=True)
    print(validation_size/(1-train_size))
    return train,validation,test

In [151]:
def saving_subsets(target_folder,train,validation,test):
    original_path = Path('/data/public/MIM_BMETS_V6/2_processed')
    main_path = Path('/data/public/MIM_BMETS_V6/3_final_datasets/')
    target_folder = main_path/target_folder
#     for name in ['training','validation','testing']:
    for filetype in ['brain_masks','images','mets_masks','skull_stripped']:
        for file in tqdm(os.listdir(original_path/filetype)):
            if file in train:
                copyfile(str(original_path/filetype/file),target_folder/'training'/filetype/file)
            elif file in validation:
                copyfile(str(original_path/filetype/file),target_folder/'validation'/filetype/file)
            elif file in test:
                copyfile(str(original_path/filetype/file),target_folder/'testing'/filetype/file)
#             if '_'.join(file.split('_')[:2]) in train:
#                 copyfile(str(original_path/filetype/file),target_folder/'training'/filetype/file)
#             elif '_'.join(file.split('_')[:2]) in validation:
#                 copyfile(str(original_path/filetype/file),target_folder/'validation'/filetype/file)
#             elif '_'.join(file.split('_')[:2]) in test:
#                 copyfile(str(original_path/filetype/file),target_folder/'testing'/filetype/file)

### Example

In [152]:
target_folder='first_treatments_1.5x1.5x3_256x256x64'

In [153]:
creating_folders(target_folder)

In [154]:
train,validation,test = data_spliting(existing_name_list,0.7,0.15)

0.4999999999999999


In [156]:
t1 = train + validation + test
print(len(set(t1)))

993


In [157]:
print(len(set(existing_name_list)))
print(len(set(train)))
print(len(set(validation)))
print(len(set(test)))

993
695
148
150


In [158]:
saving_subsets(target_folder,train,validation,test)



  0%|          | 0/2133 [00:00<?, ?it/s][A[A

  0%|          | 1/2133 [00:00<04:58,  7.15it/s][A[A

  0%|          | 3/2133 [00:00<04:10,  8.51it/s][A[A

  0%|          | 7/2133 [00:00<03:12, 11.04it/s][A[A

  0%|          | 10/2133 [00:00<02:39, 13.34it/s][A[A

  1%|          | 14/2133 [00:00<02:25, 14.60it/s][A[A

  1%|          | 19/2133 [00:00<01:56, 18.21it/s][A[A

  1%|          | 22/2133 [00:01<02:04, 16.97it/s][A[A

  1%|          | 25/2133 [00:01<02:22, 14.81it/s][A[A

  1%|▏         | 27/2133 [00:01<02:33, 13.68it/s][A[A

  1%|▏         | 30/2133 [00:01<02:16, 15.35it/s][A[A

  2%|▏         | 36/2133 [00:01<02:02, 17.09it/s][A[A

  2%|▏         | 38/2133 [00:02<02:25, 14.38it/s][A[A

  2%|▏         | 40/2133 [00:02<02:21, 14.81it/s][A[A

  2%|▏         | 42/2133 [00:02<02:45, 12.61it/s][A[A

  2%|▏         | 44/2133 [00:02<02:31, 13.81it/s][A[A

  2%|▏         | 46/2133 [00:02<02:53, 12.05it/s][A[A

  2%|▏         | 48/2133 [00:02<02:37, 13