Here we are going to add MRI column with paths to MRI files to the dataset.csv file. Merge will use only submitter_id column. This column will be checked to through out test group during pre-training of MRI encoder.

In [1]:
# import needed libraries
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold
pd.set_option('display.max_columns', None)

base_path = '/data/BraTS_2023'
dataset_file_path = '../src/data/dataset.csv'

In [2]:
dataframe = pd.read_csv(dataset_file_path)
dataframe.shape

(1104, 29)

In [3]:
dataframe.columns

Index(['Unnamed: 0', 'submitter_id', 'tumor_stage', 'age_at_diagnosis',
       'prior_treatment', 'prior_malignancy', 'synchronous_malignancy',
       'days_to_last_follow_up', 'gender', 'race', 'vital_status',
       'days_to_death', 'treatments_pharmaceutical_treatment_or_therapy',
       'treatments_radiation_treatment_or_therapy', 'project_id', 'time',
       'event', 'group', 'IDH', 'MGMT', 'X1p19q', '19.20.gain', '7g10l',
       'TERT', 'ATRX', 'DNAm', 'RNA', 'splits', 'MRI'],
      dtype='object')

In [4]:
dataframe.sample(5)

Unnamed: 0.1,Unnamed: 0,submitter_id,tumor_stage,age_at_diagnosis,prior_treatment,prior_malignancy,synchronous_malignancy,days_to_last_follow_up,gender,race,vital_status,days_to_death,treatments_pharmaceutical_treatment_or_therapy,treatments_radiation_treatment_or_therapy,project_id,time,event,group,IDH,MGMT,X1p19q,19.20.gain,7g10l,TERT,ATRX,DNAm,RNA,splits,MRI
123,123,TCGA-DU-A7TG,,14671.0,No,no,No,,male,white,Dead,1351.0,yes,no,LGG,3.70137,1,train,Mutant,Methylated,non-codel,No chr 19/20 gain,No combined CNA,Not expressed,WT,,../../DRIM/data/TCGA/GBMLGG/RNA/837fbf69-20e7-...,2.0,
16,16,TCGA-DB-5270,,13904.0,No,no,No,3733.0,female,white,Alive,,no,no,LGG,10.227397,0,train,Mutant,Methylated,non-codel,No chr 19/20 gain,No combined CNA,Not expressed,Mutant,,../../DRIM/data/TCGA/GBMLGG/RNA/2c7ecb00-2262-...,0.0,
15,15,TCGA-CS-6670,,16066.0,Yes,yes,,1426.0,male,white,Alive,,yes,,LGG,3.906849,0,train,Mutant,Methylated,codel,No chr 19/20 gain,No combined CNA,Expressed,WT,,../../DRIM/data/TCGA/GBMLGG/RNA/8e7557fc-63da-...,4.0,
81,81,TCGA-DU-7006,,21990.0,No,no,No,,female,white,Dead,349.0,yes,yes,LGG,0.956164,1,train,WT,Methylated,non-codel,No chr 19/20 gain,Gain chr 7 & loss chr 10,Expressed,WT,,../../DRIM/data/TCGA/GBMLGG/RNA/9910b4c2-f121-...,3.0,
847,847,TCGA-32-2634,,29964.0,No,,,693.0,male,white,Alive,,yes,yes,GBM,1.89863,0,train,WT,Methylated,non-codel,No chr 19/20 gain,No combined CNA,Expressed,WT,,../../DRIM/data/TCGA/GBMLGG/RNA/dfbc7136-f7f7-...,1.0,


In [5]:
# take only samples with brain as primary site
dataframe = dataframe[dataframe.project_id.isin(['GBM', 'LGG'])]
dataframe.shape

(1104, 29)

In [6]:
dataframe['group'].value_counts()

group
train    886
test     218
Name: count, dtype: int64

In [7]:
dataframe['splits'].value_counts()

splits
0.0    178
1.0    177
4.0    177
3.0    177
2.0    177
Name: count, dtype: int64

## MRI
For this part, we are using the overlap with patients from the BraTS and TCGA competitions, using the mapping file available on the competition website. Since the BraTS competition contains over a thousand cases and only 165 from the TCGA, we will first of all pre-train the models.

In [8]:
mapping_file = '../src/data/BraTS2023_2017_GLI_Mapping.xlsx'
mapping_mri = pd.read_excel(mapping_file)
print(mapping_mri.shape)
mapping_mri.sample(2)

(1255, 9)


Unnamed: 0,BraTS2023,BraTS2021,BraTS2020,BraTS2019,BraTS2018,BraTS2017,Cohort Name (if publicly available),Site No (represents the originating institution),Local ID
696,BraTS-GLI-01112-000,BraTS2021_01112,,,,,Private Collection,1,
903,BraTS-GLI-01319-000,BraTS2021_01319,,,,,Private Collection,1,


### pre-train

In [12]:
dataframe[~dataframe['MRI'].isna()]['splits'].value_counts()

splits
0.0    28
1.0    27
3.0    25
4.0    24
2.0    22
Name: count, dtype: int64

In [30]:
pretrain_mri = pd.merge(
    how='left',
    left=mapping_mri,
    right=dataframe[['submitter_id', 'group', 'splits']],
    left_on='Local ID ',
    right_on='submitter_id'
)
pretrain_mri.dropna(subset=['BraTS2023'], inplace=True)
print(f"Total patients: {len(pretrain_mri)}")

pretrain_mri['MRI'] = pretrain_mri['BraTS2023'].apply(lambda x: os.path.join(base_path, 'MRI', x))
pretrain_mri.fillna(value={'group': 'train'}, inplace=True)

def check_modalities(patients, modalities):
    '''
    get patients only with target modalities
    '''
    patients_with_needed_modalities = []
    needed_modalities = set(modalities)
    needed_modalities.add("seg") #segmentation mask used to compute center of tumor
    for patient in patients:
        available_modalities = set([x.split("-")[-1].split(".")[0] for x in os.listdir(patient)])
        if needed_modalities.intersection(available_modalities) == needed_modalities:
            patients_with_needed_modalities.append(patient)
    print(f"Patients with all needed modalities: {len(patients_with_needed_modalities)}")
    return patients_with_needed_modalities

modalities = ['t1c', 't1n', 't2w', 't2f']
patients_with_needed_modalities = check_modalities(pretrain_mri['MRI'].values, modalities)
pretrain_mri = pretrain_mri[pretrain_mri['MRI'].isin(patients_with_needed_modalities)]

#validation splits
pretrain_mri_train = pretrain_mri[pretrain_mri['group'] == 'train']
pretrain_mri_test = pretrain_mri[pretrain_mri['group'] == 'test']

pretrain_mri_train_with_splits = pretrain_mri_train[~pretrain_mri_train['splits'].isna()]
pretrain_mri_train_without_splits = pretrain_mri_train[pretrain_mri_train['splits'].isna()]

#proportional splits generation
ready_splits_distribution = pretrain_mri_train_with_splits['splits'].value_counts()
N = len(pretrain_mri_train) #total
k = len(ready_splits_distribution) #number of classes
additional_splits_distribution = ready_splits_distribution.apply(lambda x: (N//k - x)).sort_values()
additional_splits_distribution.iloc[:N%k] += 1
splits = []
for idxs in [[k] * v for k, v in additional_splits_distribution.items()]:
    splits.extend(idxs)
rng = np.random.default_rng()
rng.shuffle(splits) #inplace

# pretrain_mri_train_without_splits.drop('splits', axis="columns", inplace=True)
pretrain_mri_train_without_splits['splits'] = splits
pretrain_mri_test['splits'] = -1
pretrain_mri = pd.concat([pretrain_mri_train_without_splits, pretrain_mri_train_with_splits, pretrain_mri_test])

print(pretrain_mri['group'].value_counts())
print("ready splits", pretrain_mri_train_with_splits['splits'].value_counts().sort_index())
print("generated splits", pretrain_mri_train_without_splits['splits'].value_counts().sort_index())
print("total splits", pretrain_mri['splits'].value_counts().sort_index())

pretrain_mri = pretrain_mri[['submitter_id', 'group', 'splits', 'MRI']]
display(pretrain_mri.head(5))

pretrain_mri.to_csv(f'../src/data/pretrain_mri_{"_".join(modalities)}.csv', index=False)

Total patients: 1251
Patients with all needed modalities: 1251
group
train    1212
test       39
Name: count, dtype: int64
ready splits splits
0.0    28
1.0    27
2.0    22
3.0    25
4.0    24
Name: count, dtype: int64
generated splits splits
0.0    215
1.0    216
2.0    220
3.0    217
4.0    218
Name: count, dtype: int64
total splits splits
-1.0     39
 0.0    243
 1.0    243
 2.0    242
 3.0    242
 4.0    242
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pretrain_mri_train_without_splits['splits'] = splits
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pretrain_mri_test['splits'] = -1


Unnamed: 0,submitter_id,group,splits,MRI
0,,train,3.0,/data/BraTS_2023/MRI/BraTS-GLI-00000-000
1,,train,4.0,/data/BraTS_2023/MRI/BraTS-GLI-00002-000
2,,train,0.0,/data/BraTS_2023/MRI/BraTS-GLI-00003-000
3,,train,3.0,/data/BraTS_2023/MRI/BraTS-GLI-00005-000
4,,train,0.0,/data/BraTS_2023/MRI/BraTS-GLI-00006-000


### survival prediction task

In [27]:
mapping_mri = mapping_mri[mapping_mri['Cohort Name (if publicly available)'].str.contains('TCGA')]
print(mapping_mri.shape)
mapping_mri.sample(2)

(167, 9)


Unnamed: 0,BraTS2023,BraTS2021,BraTS2020,BraTS2019,BraTS2018,BraTS2017,Cohort Name (if publicly available),Site No (represents the originating institution),Local ID
84,BraTS-GLI-00120-000,BraTS2021_00120,BraTS20_Training_187,BraTS19_TCIA02_430_1,Brats18_TCIA02_430_1,Brats17_TCIA_430_1,TCGA-GBM,6,TCGA-06-0145
1104,BraTS-GLI-01520-000,BraTS2021_01520,BraTS20_Training_315,BraTS19_TCIA10_632_1,Brats18_TCIA10_632_1,Brats17_TCIA_632_1,TCGA-LGG,13,TCGA-DU-A6S7


In [28]:
mapping_mri = mapping_mri[["Local ID ", "BraTS2023"]]
mapping_mri['MRI'] = mapping_mri['BraTS2023'].apply(lambda x: os.path.join(base_path, 'MRI', x))
mapping_mri.drop(columns=['BraTS2023'], inplace=True)
mapping_mri.rename(columns={'Local ID ': 'submitter_id'}, inplace=True)

In [29]:
mapping_mri.sample(5)

Unnamed: 0,submitter_id,MRI
1071,TCGA-DU-5851,/data/BraTS_2023/MRI/BraTS-GLI-01487-000
96,TCGA-14-1456,/data/BraTS_2023/MRI/BraTS-GLI-00134-000
108,TCGA-02-0009,/data/BraTS_2023/MRI/BraTS-GLI-00149-000
82,TCGA-19-1789,/data/BraTS_2023/MRI/BraTS-GLI-00117-000
1027,TCGA-06-0179,/data/BraTS_2023/MRI/BraTS-GLI-01443-000


In [30]:
dataframe = pd.merge(
    how='left',
    left=dataframe,
    right=mapping_mri, 
    on='submitter_id'
)

In [31]:
dataframe.sample(2)

Unnamed: 0,submitter_id,tumor_stage,age_at_diagnosis,prior_treatment,prior_malignancy,synchronous_malignancy,days_to_last_follow_up,gender,race,vital_status,...,MGMT,X1p19q,19.20.gain,7g10l,TERT,ATRX,DNAm,RNA,splits,MRI
937,TCGA-HT-7691,,11465.0,No,no,No,3.0,female,white,Alive,...,Unmethylated,non-codel,No chr 19/20 gain,No combined CNA,Not expressed,WT,,../../DRIM/data/TCGA/GBMLGG/RNA/60353a97-9450-...,,
771,TCGA-19-5954,,26355.0,No,,,,female,white,Dead,...,Methylated,non-codel,No chr 19/20 gain,Gain chr 7 & loss chr 10,,WT,,,3.0,/data/BraTS_2023/MRI/BraTS-GLI-00138-000


In [36]:
dataframe.to_csv(dataset_file_path)

In [7]:
len(dataframe[(~dataframe['MRI'].isna()) & (dataframe['group'] == 'train')])

126

In [8]:
len(dataframe[(~dataframe['MRI'].isna()) & (dataframe['group'] == 'test')])

39