This notebook uses the super-family probabilities from the v12.8 classifier to filter the slides, which will hopefully result in an even larger training set.

In [1]:
import numpy as np
import sklearn
import pandas as pd
import h5py

import matplotlib.pyplot as plt

from skimage.transform import resize, pyramid_gaussian
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

import zarr

import os
import sys
import re

import openslide

from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay



In [25]:
# Load the per-slide classifier metadata and the chromothripsis (CT) scoring table.
slide_meta = pd.read_csv("/home/p163v/histopathology/metadata/labels_with_new_batch_v12.8.csv")
ct_scoring = pd.read_csv("/home/p163v/histopathology/metadata/CT_3_Class_Draft.csv")

# Key both tables by IDAT identifier so they can be joined on their indexes.
# The CT table is indexed by the string form of its `idat` column (the column itself is kept);
# the metadata table moves its `idat` column into the index.
ct_scoring = ct_scoring.set_index(ct_scoring["idat"].astype("str").rename("txt_idat"))
slide_meta = slide_meta.set_index("idat")

# Left join: column names present in both tables get an "l" suffix on the metadata side.
slide_annots = slide_meta.join(ct_scoring, lsuffix="l")

# Keep only slides whose CT call is one of the two definite classes.
has_definite_ct_call = slide_annots.CT_class.isin(["Chromothripsis", "No Chromothripsis"])
slide_annots = slide_annots.loc[has_definite_ct_call]
slide_names = slide_annots.uuid

# Attach the Oncotree code that corresponds to each predicted super family.
oncotree_map = pd.read_csv("/home/p163v/histopathology/metadata/MappingClassifierToOncotree.csv")
slide_annots = slide_annots.merge(
    oncotree_map,
    left_on="max_super_family_class",
    right_on="Super Family",
    how="left",
)

# Restrict to Oncotree code EMBT, then to super-family scores of at least 80 (score * 100).
is_embt = slide_annots['Oncotree code'].isin(['EMBT'])
slide_annots = slide_annots.loc[is_embt]
is_confident = slide_annots.maxscore_superfamily * 100 >= 80
slide_annots = slide_annots.loc[is_confident]

slide_annots

Unnamed: 0.1,Unnamed: 0l,uuid,tumor_id,txt_LOKALISATION,num_ALTERSANGABE,patient_id,max_super_family_class,max_family_class,max_class,max_subclass,...,prediction_class,maxscore_class,prediction_subclass,maxscore_subclass,RF_ABSOLUTE_purity,Unnamed: 0,idat,CT_class,Super Family,Oncotree code
3,9,BCEBE5E3-E482-4E27-82FD-A2FFF12F5C84,74034,"infratentoriell, hintere Schädelgrube",14.0,5457,Medulloblastoma,"medulloblastoma, WNT activated","medulloblastoma, WNT activated","Medulloblastoma, WNT activated",...,"medulloblastoma, WNT activated",0.999978,"Medulloblastoma, WNT activated",0.999978,0.626585,64.0,10003886258_R02C01,No Chromothripsis,Medulloblastoma,EMBT
20,76,AC5395D3-AD97-4A41-BA89-6323559283A1,83016,"infratentoriell, hintere Schädelgrube / KH",9.0,36927,Medulloblastoma,medulloblastoma non-WNT/non-SHH activated,medulloblastoma Group 4,"Medulloblastoma Group 4, subclass VIII",...,medulloblastoma Group 4,0.999973,"Medulloblastoma Group 4, subclass VIII",0.999961,0.600059,988.0,101178130116_R05C02,No Chromothripsis,Medulloblastoma,EMBT
37,115,381DF416-8A35-46CA-8B6D-EA194311B86E,82616,infratentoriell,52.0,36711,Medulloblastoma,"medulloblastoma, SHH-activated","medulloblastoma, SHH-activated, subtype 4","Medulloblastoma, SHH-activated, subtype 4",...,"medulloblastoma, SHH-activated, subtype 4",0.809931,"Medulloblastoma, SHH-activated, subtype 4",0.809931,0.611165,1183.0,101231000030_R05C02,No Chromothripsis,Medulloblastoma,EMBT
66,179,CB4E62D2-8ED5-444F-8FCE-23E550D97762,83340,"infratentoriell, posterior fossa",17.0,37018,Medulloblastoma,medulloblastoma non-WNT/non-SHH activated,medulloblastoma Group 4,"Medulloblastoma Group 4, subclass VIII",...,medulloblastoma Group 4,0.999807,"Medulloblastoma Group 4, subclass VIII",0.995144,0.622215,2155.0,200091640033_R04C02,No Chromothripsis,Medulloblastoma,EMBT
79,200,049207AA-0F55-41B0-83B7-08A693C2913D,83474,"infratentoriell, linke Kleinhirnhemisphäre",2.0,37046,Medulloblastoma,"medulloblastoma, SHH-activated","medulloblastoma, SHH-activated, subtype 1","Medulloblastoma, SHH-activated, subtype 1",...,"medulloblastoma, SHH-activated, subtype 1",0.996558,"Medulloblastoma, SHH-activated, subtype 1",0.996558,0.645687,2202.0,200091640040_R01C02,No Chromothripsis,Medulloblastoma,EMBT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2131,6088,E4D6DCC0-EDD1-4DEB-BD94-374DE64592D1,69662,"infratentoriell, IV Ventrikel",17.0,3805,Medulloblastoma,"medulloblastoma, WNT activated","medulloblastoma, WNT activated","Medulloblastoma, WNT activated",...,"medulloblastoma, WNT activated",0.997160,"Medulloblastoma, WNT activated",0.997160,0.613381,123842.0,9968646167_R05C01,No Chromothripsis,Medulloblastoma,EMBT
2132,6089,997BECCB-FD4F-42BF-BA31-DE1E35F95BAE,69664,"infratentoriell, Cerebellär links",38.0,3806,Medulloblastoma,"medulloblastoma, SHH-activated","medulloblastoma, SHH-activated, subtype 4","Medulloblastoma, SHH-activated, subtype 4",...,"medulloblastoma, SHH-activated, subtype 4",0.955011,"Medulloblastoma, SHH-activated, subtype 4",0.955011,0.660877,123844.0,9968646167_R06C01,No Chromothripsis,Medulloblastoma,EMBT
2163,6173,3DA60281-C3A8-481C-AB58-B3F18E60CDB0,70610,"infratentoriell, Kleinhirn",4.0,4218,Medulloblastoma,medulloblastoma non-WNT/non-SHH activated,medulloblastoma Group 4,"Medulloblastoma Group 4, subclass VI",...,medulloblastoma Group 4,0.996998,"Medulloblastoma Group 4, subclass VI",0.857598,0.539347,124235.0,9969477088_R02C01,No Chromothripsis,Medulloblastoma,EMBT
2164,6174,EB7DCFE3-8E7F-4FF2-9CF9-C1AC9E300CC2,70610,"infratentoriell, Kleinhirn",4.0,4218,Medulloblastoma,medulloblastoma non-WNT/non-SHH activated,medulloblastoma Group 4,"Medulloblastoma Group 4, subclass VI",...,medulloblastoma Group 4,0.996998,"Medulloblastoma Group 4, subclass VI",0.857598,0.539347,124235.0,9969477088_R02C01,No Chromothripsis,Medulloblastoma,EMBT


In [26]:
# Number of rows left in the annotation table after the filters above.
len(slide_annots.idat)

102

In [27]:
# Show the retained rows whose predicted super family is anything other than Medulloblastoma.
slide_annots[slide_annots["max_super_family_class"] != "Medulloblastoma"]

Unnamed: 0.1,Unnamed: 0l,uuid,tumor_id,txt_LOKALISATION,num_ALTERSANGABE,patient_id,max_super_family_class,max_family_class,max_class,max_subclass,...,prediction_class,maxscore_class,prediction_subclass,maxscore_subclass,RF_ABSOLUTE_purity,Unnamed: 0,idat,CT_class,Super Family,Oncotree code
1613,5013,EDA11A46-8944-4488-B491-5A20A937BC92,59474,"supratentorial, frontobasal",59.0,533,esthesioneuroblastoma,esthesioneuroblastoma,esthesioneuroblastoma,Olfactory neuroblastoma,...,esthesioneuroblastoma,0.999905,Olfactory neuroblastoma,0.999905,0.583036,112300.0,9007225064_R02C02,No Chromothripsis,esthesioneuroblastoma,EMBT
1815,5332,CC4A1A65-77FA-47B5-9D78-8FDA9A72650A,65268,lymph node left cervical,58.0,675,esthesioneuroblastoma,esthesioneuroblastoma,esthesioneuroblastoma,Olfactory neuroblastoma,...,esthesioneuroblastoma,0.999969,Olfactory neuroblastoma,0.999969,0.49935,117294.0,9422491056_R03C02,No Chromothripsis,esthesioneuroblastoma,EMBT
1859,5429,81E2F754-8C98-4753-81C8-05413E641EBD,65078,"skullbase, nasal cavity",44.0,672,esthesioneuroblastoma,esthesioneuroblastoma,esthesioneuroblastoma,Olfactory neuroblastoma,...,esthesioneuroblastoma,0.988144,Olfactory neuroblastoma,0.988144,0.483926,118393.0,9444375028_R05C01,No Chromothripsis,esthesioneuroblastoma,EMBT
1860,5430,E32E72E1-2074-405C-9678-DBB3CB73B14B,65078,"skullbase, nasal cavity",44.0,672,esthesioneuroblastoma,esthesioneuroblastoma,esthesioneuroblastoma,Olfactory neuroblastoma,...,esthesioneuroblastoma,0.988144,Olfactory neuroblastoma,0.988144,0.483926,118393.0,9444375028_R05C01,No Chromothripsis,esthesioneuroblastoma,EMBT
1861,5431,E56502BF-A0F0-4DFF-9A79-ED1EA6CEF9FF,65078,"skullbase, nasal cavity",44.0,672,esthesioneuroblastoma,esthesioneuroblastoma,esthesioneuroblastoma,Olfactory neuroblastoma,...,esthesioneuroblastoma,0.988144,Olfactory neuroblastoma,0.988144,0.483926,118393.0,9444375028_R05C01,No Chromothripsis,esthesioneuroblastoma,EMBT
2013,5802,3950B489-D180-406B-B042-6765DCC99878,68262,left nasal biopsy,70.0,2551,esthesioneuroblastoma,esthesioneuroblastoma,esthesioneuroblastoma,Olfactory neuroblastoma,...,esthesioneuroblastoma,0.999828,Olfactory neuroblastoma,0.999828,0.510897,121267.0,9741950120_R03C02,Chromothripsis,esthesioneuroblastoma,EMBT


In [28]:
path_to_extracted_features = '/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/'


# For every slide, record the length of the first axis of the 'feats' dataset in its
# feature file (presumably one row per extracted patch -- TODO confirm). Slides without
# a feature file on disk get a count of 0.
patch_num = []
for slide_uuid in slide_annots.uuid:
    feature_file = path_to_extracted_features + slide_uuid + ".h5"
    if os.path.exists(feature_file):
        # Open read-only and close the handle right away; the previous version left one
        # open HDF5 file per slide behind.
        with h5py.File(feature_file, "r") as h5_file:
            patch_num.append(h5_file['feats'].shape[0])
    else:
        patch_num.append(0)

In [29]:
# Store the per-slide patch counts as a new `patches` column (same row order as slide_annots).
slide_annots = slide_annots.assign(patches=patch_num)

In [30]:
# Display the annotation table again; at this point it also carries the `patches` column.
slide_annots

Unnamed: 0.1,Unnamed: 0l,uuid,tumor_id,txt_LOKALISATION,num_ALTERSANGABE,patient_id,max_super_family_class,max_family_class,max_class,max_subclass,...,maxscore_class,prediction_subclass,maxscore_subclass,RF_ABSOLUTE_purity,Unnamed: 0,idat,CT_class,Super Family,Oncotree code,patches
3,9,BCEBE5E3-E482-4E27-82FD-A2FFF12F5C84,74034,"infratentoriell, hintere Schädelgrube",14.0,5457,Medulloblastoma,"medulloblastoma, WNT activated","medulloblastoma, WNT activated","Medulloblastoma, WNT activated",...,0.999978,"Medulloblastoma, WNT activated",0.999978,0.626585,64.0,10003886258_R02C01,No Chromothripsis,Medulloblastoma,EMBT,1381
20,76,AC5395D3-AD97-4A41-BA89-6323559283A1,83016,"infratentoriell, hintere Schädelgrube / KH",9.0,36927,Medulloblastoma,medulloblastoma non-WNT/non-SHH activated,medulloblastoma Group 4,"Medulloblastoma Group 4, subclass VIII",...,0.999973,"Medulloblastoma Group 4, subclass VIII",0.999961,0.600059,988.0,101178130116_R05C02,No Chromothripsis,Medulloblastoma,EMBT,6674
37,115,381DF416-8A35-46CA-8B6D-EA194311B86E,82616,infratentoriell,52.0,36711,Medulloblastoma,"medulloblastoma, SHH-activated","medulloblastoma, SHH-activated, subtype 4","Medulloblastoma, SHH-activated, subtype 4",...,0.809931,"Medulloblastoma, SHH-activated, subtype 4",0.809931,0.611165,1183.0,101231000030_R05C02,No Chromothripsis,Medulloblastoma,EMBT,4076
66,179,CB4E62D2-8ED5-444F-8FCE-23E550D97762,83340,"infratentoriell, posterior fossa",17.0,37018,Medulloblastoma,medulloblastoma non-WNT/non-SHH activated,medulloblastoma Group 4,"Medulloblastoma Group 4, subclass VIII",...,0.999807,"Medulloblastoma Group 4, subclass VIII",0.995144,0.622215,2155.0,200091640033_R04C02,No Chromothripsis,Medulloblastoma,EMBT,3231
79,200,049207AA-0F55-41B0-83B7-08A693C2913D,83474,"infratentoriell, linke Kleinhirnhemisphäre",2.0,37046,Medulloblastoma,"medulloblastoma, SHH-activated","medulloblastoma, SHH-activated, subtype 1","Medulloblastoma, SHH-activated, subtype 1",...,0.996558,"Medulloblastoma, SHH-activated, subtype 1",0.996558,0.645687,2202.0,200091640040_R01C02,No Chromothripsis,Medulloblastoma,EMBT,7877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2131,6088,E4D6DCC0-EDD1-4DEB-BD94-374DE64592D1,69662,"infratentoriell, IV Ventrikel",17.0,3805,Medulloblastoma,"medulloblastoma, WNT activated","medulloblastoma, WNT activated","Medulloblastoma, WNT activated",...,0.997160,"Medulloblastoma, WNT activated",0.997160,0.613381,123842.0,9968646167_R05C01,No Chromothripsis,Medulloblastoma,EMBT,5114
2132,6089,997BECCB-FD4F-42BF-BA31-DE1E35F95BAE,69664,"infratentoriell, Cerebellär links",38.0,3806,Medulloblastoma,"medulloblastoma, SHH-activated","medulloblastoma, SHH-activated, subtype 4","Medulloblastoma, SHH-activated, subtype 4",...,0.955011,"Medulloblastoma, SHH-activated, subtype 4",0.955011,0.660877,123844.0,9968646167_R06C01,No Chromothripsis,Medulloblastoma,EMBT,12573
2163,6173,3DA60281-C3A8-481C-AB58-B3F18E60CDB0,70610,"infratentoriell, Kleinhirn",4.0,4218,Medulloblastoma,medulloblastoma non-WNT/non-SHH activated,medulloblastoma Group 4,"Medulloblastoma Group 4, subclass VI",...,0.996998,"Medulloblastoma Group 4, subclass VI",0.857598,0.539347,124235.0,9969477088_R02C01,No Chromothripsis,Medulloblastoma,EMBT,362
2164,6174,EB7DCFE3-8E7F-4FF2-9CF9-C1AC9E300CC2,70610,"infratentoriell, Kleinhirn",4.0,4218,Medulloblastoma,medulloblastoma non-WNT/non-SHH activated,medulloblastoma Group 4,"Medulloblastoma Group 4, subclass VI",...,0.996998,"Medulloblastoma Group 4, subclass VI",0.857598,0.539347,124235.0,9969477088_R02C01,No Chromothripsis,Medulloblastoma,EMBT,2423


In [31]:
# np.random.seed(42)

# slide_train, slide_valid = train_test_split(np.array(slide_annots.uuid), train_size = 0.6)

# slide_valid, slide_test = train_test_split(slide_valid, train_size = 0.5)

In [32]:
# Binary target: 1 for "Chromothripsis", 0 otherwise ("No Chromothripsis" after the filter above).
# Comparing against the class name directly keeps the encoding correct even when only one of
# the two classes is present; the previous factorize-based version labelled every slide 1 in
# that case.
slide_annots['labels'] = np.where(slide_annots.CT_class == "Chromothripsis", 1, 0)

In [33]:
# Seed NumPy's global RNG.
# NOTE(review): StratifiedKFold is built without shuffle=True here, so the seed presumably only
# matters for the random train/valid split performed in the following cell -- confirm.
np.random.seed(42)

# Five label-stratified folds over the slides. Each entry of `kfold_test_splits` is a pair of
# positional index arrays: (train+validation rows, held-out test rows).
# NOTE(review): the split is made per slide; the table above contains several slides that share
# a patient_id, so slides of one patient may end up on both sides of a split -- confirm whether
# a patient-grouped split is wanted.
kfold = StratifiedKFold(n_splits=5)
kfold_test_splits = list(
    kfold.split(np.array(slide_annots.uuid), np.array(slide_annots.labels))
)




In [34]:
# Split the non-test part of every fold into train (75%) and validation (25%), stratified by
# label. Each entry of the result is a list [train_idx, valid_idx, test_idx] of positional row
# indices into slide_annots.
#
# A dedicated RandomState is passed explicitly so this cell gives the same splits every time it
# is run, instead of depending on the global NumPy seed having been set immediately beforehand.
# NOTE(review): one generator seeded with 42 is shared across the folds, which is intended to
# reproduce the sequence obtained from `np.random.seed(42)` followed directly by this cell --
# confirm against previously exported splits.
split_rng = np.random.RandomState(42)
all_labels = np.array(slide_annots.labels)

kfold_train_valid_test_splits = []
for train_valid_idx, test_idx in kfold_test_splits:
    train_idx, valid_idx = train_test_split(
        train_valid_idx,
        train_size=0.75,
        stratify=all_labels[train_valid_idx],
        random_state=split_rng,
    )
    kfold_train_valid_test_splits.append([train_idx, valid_idx, test_idx])




In [35]:
# Root directory that receives one sub-directory per fold ("0", "1", ...).
splits_root = '/omics/odcf/analysis/OE0585_projects/chromothripsis/histopathology/splits/09082024_EMBT/'

# Convert once instead of on every access inside the loop.
all_slide_uuids = np.array(slide_annots.uuid)
all_patch_counts = np.array(slide_annots.patches)


def _export_split(indices, csv_path):
    """Write one split's slide table to `csv_path`.

    `indices` are positional row indices into slide_annots. The CSV has the columns
    `slide` (uuid), `features` (path of the slide's .h5 feature file) and `patches`
    (patch count), plus the default integer index written by `to_csv`.

    Returns the tuple (slides, feature_files, patch_counts, frame) so the caller can keep
    the intermediate values.
    """
    slides = all_slide_uuids[indices]
    feature_files = path_to_extracted_features + slides + '.h5'
    patch_counts = all_patch_counts[indices]
    frame = pd.DataFrame({'slide': slides,
                          'features': feature_files,
                          'patches': patch_counts})
    frame.to_csv(csv_path)
    return slides, feature_files, patch_counts, frame


# Iterate over however many folds were generated rather than a hard-coded count.
for i, (train_idx, valid_idx, test_idx) in enumerate(kfold_train_valid_test_splits):
    print('starting: '+str(i))
    fold_dir = splits_root + str(i)
    os.makedirs(fold_dir, exist_ok=True)

    slide_train, slide_train_files, patch_num_train, train_df = _export_split(
        train_idx, fold_dir + '/train_set.csv')
    slide_valid, slide_valid_files, patch_num_valid, valid_df = _export_split(
        valid_idx, fold_dir + '/valid_set.csv')
    slide_test, slide_test_files, patch_num_test, test_df = _export_split(
        test_idx, fold_dir + '/test_set.csv')
    

starting: 0
starting: 1
starting: 2
starting: 3
starting: 4


In [36]:
# Shapes of the train / validation / test index arrays of the first fold.
[np.shape(index_array) for index_array in kfold_train_valid_test_splits[0]]

[(60,), (21,), (21,)]