In [None]:
import numpy as np
import sklearn
import pandas as pd
import h5py

import matplotlib.pyplot as plt

from skimage.transform import resize, pyramid_gaussian
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

import zarr
import torch

import os
import sys
import re

import openslide

from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay



In [None]:
# Load the per-slide metadata and the chromothripsis (CT) scoring table.
slide_meta = pd.read_csv("../metadata/labels_with_new_batch.csv")
ct_scoring = pd.read_csv("../metadata/CT_3_Class_Draft.csv")

# Index both tables by array ID so they can be joined on it.
# ct_scoring keeps its own "idat" column; only its index is the string-cast copy.
ct_scoring.index = ct_scoring["idat"].astype("str").rename("txt_idat")
slide_meta = slide_meta.set_index("idat")
slide_annots = slide_meta.join(ct_scoring, lsuffix="l")

# Keep only slides scored as one of the two classes of interest.
# The mask is converted to a plain boolean array so no index alignment is
# attempted (the idat index is not unique: one array can have several slides).
myx = slide_annots["CT_class"].isin(["Chromothripsis", "No Chromothripsis"]).to_numpy()

# .copy() makes the filtered frame independent, so columns added to it in
# later cells do not trigger SettingWithCopyWarning.
slide_annots = slide_annots.loc[myx].copy()
slide_names = slide_annots.uuid

# slide_names
slide_annots.CT_class

idat
10003886253_R02C02       Chromothripsis
10003886253_R03C01    No Chromothripsis
10003886256_R03C02    No Chromothripsis
10003886258_R02C01    No Chromothripsis
10003886259_R02C01    No Chromothripsis
                            ...        
9969477124_R05C02     No Chromothripsis
9980102013_R06C01     No Chromothripsis
9980102032_R03C01     No Chromothripsis
9980102032_R04C01     No Chromothripsis
9980102032_R05C01     No Chromothripsis
Name: CT_class, Length: 2215, dtype: object

In [None]:
# Distinct array IDs among the retained slides (the "idat" column joined in from ct_scoring).
slide_annots["idat"].unique()

array(['10003886253_R02C02', '10003886253_R03C01', '10003886256_R03C02',
       ..., '9980102032_R03C01', '9980102032_R04C01', '9980102032_R05C01'],
      dtype=object)

In [None]:
# Display the slide metadata table (indexed by idat) for visual inspection.
slide_meta

Unnamed: 0_level_0,uuid,tumor_id,txt_LOKALISATION,num_ALTERSANGABE,patient_id,max_super_family_class,max_family_class,max_class,max_subclass,relevant Histo prediction,possible further consolidation,classifciation_v11,max_cal_v11,family,file_path,slide
idat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
204920830120_R01C01,5E876B19-B3C5-42F0-9034-E171C9185A61,234138,"supratentoriell, temporo-frontal rechts",34.0,111977,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass high gr...",0.912418,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,5E876B19-B3C5-42F0-9034-E171C9185A61
204920830120_R01C01,FB077233-EDC3-4A52-BD6D-F2F330D7FA62,234138,"supratentoriell, temporo-frontal rechts",34.0,111977,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass high gr...",0.912418,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,FB077233-EDC3-4A52-BD6D-F2F330D7FA62
204920830120_R01C01,23A493E4-3A63-410B-9659-3AFDF2C366EE,234138,"supratentoriell, temporo-frontal rechts",34.0,111977,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass high gr...",0.912418,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,23A493E4-3A63-410B-9659-3AFDF2C366EE
207011010162_R07C01,7EC4A50F-B422-413F-983E-2418103F347F,326008,"supratentoriell, temporo-insulär rechts",37.0,156108,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; lower grade","Astrocytoma, IDH-mutant; lower grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass astrocy...",0.934602,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,7EC4A50F-B422-413F-983E-2418103F347F
205566000169_R07C01,69DD0320-2930-49DD-9F3A-AAA43519D52A,258410,,,123570,"diffuse glioma, MAPK altered, cell-cycle activ...",pleomorphic xanthoastrocytoma(-like),pleomorphic xanthoastrocytoma(-like),Pleomorphic xanthoastrocytoma,Pleomorphic xanthoastrocytoma,Pleomorphic xanthoastrocytoma,methylation class (anaplastic) pleomorphic xan...,0.973061,pleomorphic xanthoastrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,69DD0320-2930-49DD-9F3A-AAA43519D52A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204339010057_R06C01,872427CC-F757-4CEA-A33F-17B43C872535,173738,"supratentoriell, Thalamus links",44.0,83651,Paediatric-type diffuse high-grade gliomas,"Diffuse pediatric-type high-grade glioma, H3-w...","Diffuse paediatric-type high grade glioma, RTK...","Diffuse paediatric-type high grade glioma, RTK...","Diffuse paediatric-type high grade glioma, RTK...",Diffuse paediatric-type high grade glioma,methylation class CNS neuroblastoma with FOXR2...,0.092167,neuroblastoma,/omics/odcf/analysis/OE0606_projects/pancancer...,872427CC-F757-4CEA-A33F-17B43C872535
206947700046_R07C01,AAB165F9-6A45-4545-BCA3-67A0BE33AB26,317894,,,151989,Ependymal tumours,myxopapillary ependymoma,myxopapillary ependymoma,Myxopapillary ependymoma,Myxopapillary ependymoma,Myxopapillary ependymoma,,,,/omics/odcf/analysis/OE0606_projects/pancancer...,AAB165F9-6A45-4545-BCA3-67A0BE33AB26
9741950087_R03C02,CADDF04F-C9A1-4A84-A840-1AA1053A982D,67754,"supratentoriell, frontal links",31.0,2105,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; lower grade","Astrocytoma, IDH-mutant; lower grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass astrocy...",0.997145,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,CADDF04F-C9A1-4A84-A840-1AA1053A982D
205059630019_R05C01,24CAA1E6-EA6E-4EB5-81FB-39C2740795ED,254654,intrazerebral,86.0,121779,Adult-type diffuse gliomas,"Glioblastoma, IDH-wildtype","glioblastoma, IDH-wildtype, RTK1 type","Glioblastoma, IDH-wildtype, RTK1 subtype","Glioblastoma, IDH-wildtype, RTK1 subtype","glioblastoma, IDH-wildtype","methylation class glioblastoma, IDH wildtype, ...",0.562538,glioblastoma,/omics/odcf/analysis/OE0606_projects/pancancer...,24CAA1E6-EA6E-4EB5-81FB-39C2740795ED


In [None]:
# Directory holding one "<uuid>.pt" feature file per slide.
path_to_extracted_features = '/omics/odcf/analysis/OE0606_projects/pancancer_histopathology/analysis/shared_playground/CNS_classification/embeddings/UNI_256_1024_UKHD_FULL_dataset/pt_files/'


# Number of patches per slide, in slide_annots.uuid order.
# Slides without a feature file are recorded with 0 patches rather than dropped.
# assumes each .pt file stores a tensor whose first dimension indexes patches - TODO confirm
patch_num = []
for slide_uuid in slide_annots.uuid:
    fl = path_to_extracted_features+slide_uuid+".pt"
    if os.path.exists(fl):
        # map_location="cpu": only the shape is needed, and this keeps the load
        # working on hosts without the device the tensor was saved from.
        patch_num.append(torch.load(fl, map_location="cpu").shape[0])
    else:
        patch_num.append(0)


In [7]:
# Attach the per-slide patch counts; patch_num was built in slide_annots.uuid
# order, and a value of 0 marks a slide whose feature file was not found.
slide_annots['patches'] = patch_num

In [8]:
# Display the annotated slide table, now including the 'patches' column.
slide_annots

Unnamed: 0_level_0,uuid,tumor_id,txt_LOKALISATION,num_ALTERSANGABE,patient_id,max_super_family_class,max_family_class,max_class,max_subclass,relevant Histo prediction,possible further consolidation,classifciation_v11,max_cal_v11,family,file_path,slide,Unnamed: 0,idat,CT_class,patches
idat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
10003886253_R02C02,1EE2D6AD-2F2A-4457-BA86-4CC8D137249D,73896,"infratentoriell, sakral",30.0,4981,Ependymal tumours,myxopapillary ependymoma,myxopapillary ependymoma,Myxopapillary ependymoma,Myxopapillary ependymoma,Myxopapillary ependymoma,"methylation class ependymoma, myxopapillary",0.996477,ependymoma,/omics/odcf/analysis/OE0606_projects/pancancer...,1EE2D6AD-2F2A-4457-BA86-4CC8D137249D,41.0,10003886253_R02C02,Chromothripsis,16213
10003886253_R03C01,9625ECC7-5AA1-4ADF-B99E-B2F959A2317E,73878,"spinal, TH 11/12 extramedullär",71.0,5395,"Mesenchymal, non-meningothelial tumours involv...",Fibroblastic and myofibroblastic tumours,solitary fibrous tumour / haemangiopericytoma,Solitary fibrous tumour / haemangiopericytoma,Solitary fibrous tumour / haemangiopericytoma,Solitary fibrous tumour / haemangiopericytoma,methylation class solitary fibrous tumor / hem...,0.999968,hemangiopericytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,9625ECC7-5AA1-4ADF-B99E-B2F959A2317E,42.0,10003886253_R03C01,No Chromothripsis,6040
10003886256_R03C02,A0517565-0BA3-43F8-A54C-33F403262927,73948,infratentoriell,34.0,5460,Control tissues,"control tissue, reactive tumour microenvironment","control tissue, reactive tumour microenvironment","Control tissue, reactive tumour microenvironment","Control tissue, reactive tumour microenvironment","Control tissue, reactive tumour microenvironment","methylation class low grade glioma, subclass p...",0.149055,PA and other MAPK LGGNT,/omics/odcf/analysis/OE0606_projects/pancancer...,A0517565-0BA3-43F8-A54C-33F403262927,55.0,10003886256_R03C02,No Chromothripsis,19018
10003886258_R02C01,BCEBE5E3-E482-4E27-82FD-A2FFF12F5C84,74034,"infratentoriell, hintere Schädelgrube",14.0,5457,Medulloblastoma,"medulloblastoma, WNT activated","medulloblastoma, WNT activated","Medulloblastoma, WNT activated","Medulloblastoma, WNT activated","Medulloblastoma, WNT activated","methylation class medulloblastoma, WNT",0.999083,medulloblastoma_WNT,/omics/odcf/analysis/OE0606_projects/pancancer...,BCEBE5E3-E482-4E27-82FD-A2FFF12F5C84,64.0,10003886258_R02C01,No Chromothripsis,3811
10003886259_R02C01,88FB4DE0-39AE-4FD6-ACFA-68C35A57669F,74022,"supratentoriell, frontal rechts",11.0,5459,Low-grade glial/glioneuronal/neuroepithelial t...,low-grade glioneuronal tumour,dysembryoplastic neuroepithelial tumour,Dysembryoplastic neuroepithelial tumour,Dysembryoplastic neuroepithelial tumour,Dysembryoplastic neuroepithelial tumour,"methylation class low grade glioma, dysembryop...",0.999950,PA and other MAPK LGGNT,,88FB4DE0-39AE-4FD6-ACFA-68C35A57669F,76.0,10003886259_R02C01,No Chromothripsis,7359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9969477124_R05C02,24D39BD4-471B-43E7-A23D-1647F9DD9F54,67402,,21.0,2039,meningioma,meningioma,"meningioma, benign","Meningioma, subclass benign 1","meningioma, benign","meningioma, benign",methylation class meningioma,0.999110,meningioma,/omics/odcf/analysis/OE0606_projects/pancancer...,24D39BD4-471B-43E7-A23D-1647F9DD9F54,124466.0,9969477124_R05C02,No Chromothripsis,3617
9980102013_R06C01,95C1EFE3-112A-48EF-A15C-F7A39AC13247,71960,"supratentoriell, frontal",9.0,4583,Paediatric-type diffuse high-grade gliomas,"Diffuse pediatric-type high-grade glioma, H3-w...","Diffuse paediatric-type high grade glioma, RTK...","Diffuse paediatric-type high grade glioma, RTK...","Diffuse paediatric-type high grade glioma, RTK...",Diffuse paediatric-type high grade glioma,"methylation class low grade glioma, subclass h...",0.427709,PA and other MAPK LGGNT,/omics/odcf/analysis/OE0606_projects/pancancer...,95C1EFE3-112A-48EF-A15C-F7A39AC13247,124579.0,9980102013_R06C01,No Chromothripsis,11189
9980102032_R03C01,4C83D422-9DC5-4E3A-A120-6E2DEB47D553,71988,"infratentoriell, cerebellär links",27.0,4799,Medulloblastoma,"medulloblastoma, SHH-activated","medulloblastoma, SHH-activated, subtype 4","Medulloblastoma, SHH-activated, subtype 4","medulloblastoma, SHH-activated","medulloblastoma, SHH-activated","methylation class medulloblastoma, subclass SH...",0.999697,medulloblastoma_SHH,/omics/odcf/analysis/OE0606_projects/pancancer...,4C83D422-9DC5-4E3A-A120-6E2DEB47D553,124585.0,9980102032_R03C01,No Chromothripsis,18452
9980102032_R04C01,6936A1E9-D6D8-4A44-9CA7-989485943185,71990,"supratentoriell, Thalamus",10.0,4798,Control tissues,"control tissue, reactive tumour microenvironment","control tissue, reactive tumour microenvironment","Control tissue, reactive tumour microenvironment","Control tissue, reactive tumour microenvironment","Control tissue, reactive tumour microenvironment","methylation class control tissue, reactive brain",0.934114,control,/omics/odcf/analysis/OE0606_projects/pancancer...,6936A1E9-D6D8-4A44-9CA7-989485943185,124587.0,9980102032_R04C01,No Chromothripsis,22356


In [7]:
# Seed the global NumPy RNG, which train_test_split draws from when no
# random_state is passed.
np.random.seed(42)

# Single (unstratified) hold-out split on slide uuid:
# 60% train, and the remaining 40% halved into validation and test.
slide_train, slide_holdout = train_test_split(slide_annots["uuid"].to_numpy(), train_size=0.6)
slide_valid, slide_test = train_test_split(slide_holdout, train_size=0.5)

In [9]:
# Binary target: 1 = "Chromothripsis", 0 = "No Chromothripsis".
# An explicit mapping keeps the encoding independent of the alphabetical order
# of the class names and of whether both classes are present in the table.
CT_LABELS = {"Chromothripsis": 1, "No Chromothripsis": 0}
ct_labels = slide_annots["CT_class"].map(CT_LABELS)
assert ct_labels.notna().all(), "CT_class contains values other than the two expected classes"
# Assign as a plain array so no index alignment is attempted on the
# (non-unique) idat index.
slide_annots['labels'] = ct_labels.to_numpy(dtype="int64")

In [10]:
np.random.seed(42)

# 5-fold stratified cross-validation over slides; each fold's held-out part
# becomes that fold's test set.
# NOTE(review): shuffle is left at its default, so the seed above should not
# influence the fold assignment itself - confirm against the scikit-learn docs.
# NOTE(review): slide_meta lists several slide uuids for one patient_id, so
# splitting on uuid may place slides of the same patient in different folds;
# consider a group-aware splitter if that matters for evaluation.
kfold = StratifiedKFold(n_splits=5)
kfold_test_splits = list(
    kfold.split(slide_annots["uuid"].to_numpy(), slide_annots["labels"].to_numpy())
)




In [11]:
# For every CV fold, split the non-test indices 75/25 into train and validation
# (stratified by label) and append the fold's test indices, giving
# [train_idx, valid_idx, test_idx] per fold.
# The generator is seeded locally so re-running this cell on its own yields the
# same splits instead of depending on the global NumPy RNG state left behind by
# whichever cell ran before it.
split_rng = np.random.RandomState(42)
all_labels = np.array(slide_annots.labels)

kfold_train_valid_test_splits = [
    train_test_split(
        dev_idx,
        train_size=0.75,
        stratify=all_labels[dev_idx],
        random_state=split_rng,
    ) + [test_idx]
    for dev_idx, test_idx in kfold_test_splits
]




In [19]:
# Write train/valid/test CSVs (slide uuid, feature file path, patch count) for
# every cross-validation fold into <SPLITS_DIR>/<fold>/.
SPLITS_DIR = '/omics/odcf/analysis/OE0585_projects/chromothripsis/histopathology/splits/15052024_UNI'

# Loop-invariant lookups, converted once instead of on every iteration.
all_uuids = np.array(slide_annots.uuid)
all_patches = np.array(slide_annots.patches)

# Each entry of kfold_train_valid_test_splits is [train_idx, valid_idx, test_idx].
for i, (train_idx, valid_idx, test_idx) in enumerate(kfold_train_valid_test_splits):
    print('starting: '+str(i))
    fold_dir = os.path.join(SPLITS_DIR, str(i))
    os.makedirs(fold_dir, exist_ok=True)

    # Training subset of this fold.
    slide_train = all_uuids[train_idx]
    slide_train_files = path_to_extracted_features + slide_train + '.pt'
    patch_num_train = all_patches[train_idx]
    train_df = pd.DataFrame({'slide': slide_train,
                             'features': slide_train_files,
                             'patches': patch_num_train})
    train_df.to_csv(os.path.join(fold_dir, 'train_set.csv'))

    # Validation subset of this fold.
    slide_valid = all_uuids[valid_idx]
    slide_valid_files = path_to_extracted_features + slide_valid + '.pt'
    patch_num_valid = all_patches[valid_idx]
    valid_df = pd.DataFrame({'slide': slide_valid,
                             'features': slide_valid_files,
                             'patches': patch_num_valid})
    valid_df.to_csv(os.path.join(fold_dir, 'valid_set.csv'))

    # Held-out test subset of this fold.
    slide_test = all_uuids[test_idx]
    slide_test_files = path_to_extracted_features + slide_test + '.pt'
    patch_num_test = all_patches[test_idx]
    test_df = pd.DataFrame({'slide': slide_test,
                            'features': slide_test_files,
                            'patches': patch_num_test})
    test_df.to_csv(os.path.join(fold_dir, 'test_set.csv'))
    

starting: 0
starting: 1
starting: 2
starting: 3
starting: 4


In [18]:
# Quick check of how many entries patch_num_train holds. The name is assigned in
# several cells of this notebook, so the value depends on which one ran last.
len(patch_num_train)

0

In [30]:
# Build the training table for a single fold (fold 0), reading patch counts
# from the 'feats' dataset of each feature file.
# NOTE(review): the paths end in '.pt' but are opened with h5py - confirm which
# feature format this cell is meant to read.
i = 0
slide_train = path_to_extracted_features+np.array(slide_annots.uuid)[kfold_train_valid_test_splits[i][0]] + '.pt'

patch_num_train = list()

for fl in slide_train:
    if os.path.exists(fl):
        # Open read-only and close the handle as soon as the shape is known,
        # so file handles do not accumulate across the slides.
        with h5py.File(fl, "r") as feats_file:
            patch_num_train.append(feats_file['feats'].shape[0])
    else:
        # Missing feature file: keep the slide, with a patch count of 0.
        patch_num_train.append(0)


train_df = pd.DataFrame({'slide': slide_train,
                         'patches': patch_num_train})


In [33]:
# Save the training table of fold i. The fold directory is created first so the
# write does not fail when the directory does not exist yet.
fold_dir = '/omics/odcf/analysis/OE0585_projects/chromothripsis/histopathology/splits/02022024/'+str(i)
os.makedirs(fold_dir, exist_ok=True)
train_df.to_csv(fold_dir+'/train_set.csv')

',slide,patches\n0,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/EDA11A46-8944-4488-B491-5A20A937BC92.h5,14822\n1,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/0B86F403-C968-43C8-A6C7-FB22826978AD.h5,6664\n2,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/45992A98-BCD4-4FCF-9A62-3CE2D8E9E680.h5,50\n3,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/EF16DACE-26EA-4D17-9C93-295E8F8550E7.h5,2316\n4,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/1AB9E8D7-FD1D-478D-A2BC-B9728F3452E9.h5,7328\n5,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/EC42213A-E4A7-4668-9CFA-F3C0C12938DD.h5,5055\n6,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/727AC7CA-9C36-4C98-B767-86C088A55653.h5,1721\n7,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/32B5574A-3379-43D1-9878-43E35C1FEACA.h5,2540\n8,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/BC7E390E-A1A5-429C-9FD3-3106E5179368.h5,8582\n9,/home/p163v/histopathology/UKHD_Neuro/RetCLL_Features/3A2C2EE9-D

In [15]:
# Shapes of the train / valid / test index arrays of the first fold.
[part.shape for part in kfold_train_valid_test_splits[0]]

[(1329,), (443,), (443,)]

In [12]:
def _count_h5_patches(slides):
    """Return the patch count of every slide in ``slides``, in order.

    For each slide the file ``path_to_extracted_features + slide + ".h5"`` is
    opened read-only and the length of the first axis of its ``'feats'``
    dataset is recorded. Slides whose file does not exist get a count of 0.
    """
    counts = list()
    for x in slides:
        fl = path_to_extracted_features+x+".h5"
        if os.path.exists(fl):
            # Context manager closes the HDF5 handle right after reading the shape.
            with h5py.File(fl, "r") as feats_file:
                counts.append(feats_file['feats'].shape[0])
        else:
            counts.append(0)
    return counts


# Patch counts for the current train / validation / test slide lists.
patch_num_train = _count_h5_patches(slide_train)
patch_num_valid = _count_h5_patches(slide_valid)
patch_num_test = _count_h5_patches(slide_test)

      

In [41]:
def _write_split_listing(out_path, slides, patch_counts):
    """Write a two-column listing ("File,patches") for one data split.

    Each row is "<slide>.h5,<patch count>". The lengths of ``slides`` and
    ``patch_counts`` are checked before the file is opened, so a mismatch
    raises ValueError instead of leaving a partially written file.
    """
    if len(slides) != len(patch_counts):
        raise ValueError(
            "slides and patch_counts differ in length: "
            + str(len(slides)) + " vs " + str(len(patch_counts))
        )
    with open(out_path, "w") as f:
        f.write('File,patches\n')
        for slide, count in zip(slides, patch_counts):
            f.write(slide+'.h5,')
            f.write(str(count) + '\n')


# One listing per split, pairing every slide with its patch count.
_write_split_listing('../metadata/train_set_02022024_01.txt', slide_train, patch_num_train)
_write_split_listing('../metadata/valid_set_02022024_01.txt', slide_valid, patch_num_valid)
_write_split_listing('../metadata/test_set_02022024_01.txt', slide_test, patch_num_test)


