In [1]:
import numpy as np
import sklearn
import pandas as pd
import h5py

import matplotlib.pyplot as plt

from skimage.transform import resize, pyramid_gaussian
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

import zarr
import torch

import os
import sys
import re

import openslide

from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay



In [2]:
slide_meta = pd.read_csv("../metadata/labels_with_new_batch.csv")
ct_scoring = pd.read_csv("../metadata/CT_3_Class_Draft.csv")



ct_scoring["txt_idat"] = ct_scoring["idat"].astype("str")
ct_scoring.index = ct_scoring.txt_idat
slide_meta.index = slide_meta.idat
ct_scoring = ct_scoring.drop("txt_idat", axis=1)
slide_meta = slide_meta.drop("idat", axis=1)
slide_annots = slide_meta.join(ct_scoring, lsuffix="l")


myx = [x in ["Chromothripsis", "No Chromothripsis"] for x in slide_annots.CT_class]

slide_annots = slide_annots.loc[myx]
slide_names = slide_annots.uuid

# slide_names
slide_annots.CT_class

idat
10003886253_R02C02       Chromothripsis
10003886253_R03C01    No Chromothripsis
10003886256_R03C02    No Chromothripsis
10003886258_R02C01    No Chromothripsis
10003886259_R02C01    No Chromothripsis
                            ...        
9969477124_R05C02     No Chromothripsis
9980102013_R06C01     No Chromothripsis
9980102032_R03C01     No Chromothripsis
9980102032_R04C01     No Chromothripsis
9980102032_R05C01     No Chromothripsis
Name: CT_class, Length: 2215, dtype: object

In [3]:
slide_annots.idat.unique()

array(['10003886253_R02C02', '10003886253_R03C01', '10003886256_R03C02',
       ..., '9980102032_R03C01', '9980102032_R04C01', '9980102032_R05C01'],
      dtype=object)

In [4]:
slide_meta

Unnamed: 0_level_0,uuid,tumor_id,txt_LOKALISATION,num_ALTERSANGABE,patient_id,max_super_family_class,max_family_class,max_class,max_subclass,relevant Histo prediction,possible further consolidation,classifciation_v11,max_cal_v11,family,file_path,slide
idat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
204920830120_R01C01,5E876B19-B3C5-42F0-9034-E171C9185A61,234138,"supratentoriell, temporo-frontal rechts",34.0,111977,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass high gr...",0.912418,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,5E876B19-B3C5-42F0-9034-E171C9185A61
204920830120_R01C01,FB077233-EDC3-4A52-BD6D-F2F330D7FA62,234138,"supratentoriell, temporo-frontal rechts",34.0,111977,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass high gr...",0.912418,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,FB077233-EDC3-4A52-BD6D-F2F330D7FA62
204920830120_R01C01,23A493E4-3A63-410B-9659-3AFDF2C366EE,234138,"supratentoriell, temporo-frontal rechts",34.0,111977,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant; high grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass high gr...",0.912418,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,23A493E4-3A63-410B-9659-3AFDF2C366EE
207011010162_R07C01,7EC4A50F-B422-413F-983E-2418103F347F,326008,"supratentoriell, temporo-insulär rechts",37.0,156108,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; lower grade","Astrocytoma, IDH-mutant; lower grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass astrocy...",0.934602,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,7EC4A50F-B422-413F-983E-2418103F347F
205566000169_R07C01,69DD0320-2930-49DD-9F3A-AAA43519D52A,258410,,,123570,"diffuse glioma, MAPK altered, cell-cycle activ...",pleomorphic xanthoastrocytoma(-like),pleomorphic xanthoastrocytoma(-like),Pleomorphic xanthoastrocytoma,Pleomorphic xanthoastrocytoma,Pleomorphic xanthoastrocytoma,methylation class (anaplastic) pleomorphic xan...,0.973061,pleomorphic xanthoastrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,69DD0320-2930-49DD-9F3A-AAA43519D52A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204339010057_R06C01,872427CC-F757-4CEA-A33F-17B43C872535,173738,"supratentoriell, Thalamus links",44.0,83651,Paediatric-type diffuse high-grade gliomas,"Diffuse pediatric-type high-grade glioma, H3-w...","Diffuse paediatric-type high grade glioma, RTK...","Diffuse paediatric-type high grade glioma, RTK...","Diffuse paediatric-type high grade glioma, RTK...",Diffuse paediatric-type high grade glioma,methylation class CNS neuroblastoma with FOXR2...,0.092167,neuroblastoma,/omics/odcf/analysis/OE0606_projects/pancancer...,872427CC-F757-4CEA-A33F-17B43C872535
206947700046_R07C01,AAB165F9-6A45-4545-BCA3-67A0BE33AB26,317894,,,151989,Ependymal tumours,myxopapillary ependymoma,myxopapillary ependymoma,Myxopapillary ependymoma,Myxopapillary ependymoma,Myxopapillary ependymoma,,,,/omics/odcf/analysis/OE0606_projects/pancancer...,AAB165F9-6A45-4545-BCA3-67A0BE33AB26
9741950087_R03C02,CADDF04F-C9A1-4A84-A840-1AA1053A982D,67754,"supratentoriell, frontal links",31.0,2105,Adult-type diffuse gliomas,"diffuse glioma, IDH mutant","diffuse glioma, IDH-mutant and 1p19q retained ...","Astrocytoma, IDH-mutant; lower grade","Astrocytoma, IDH-mutant; lower grade","Astrocytoma, IDH-mutant","methylation class IDH glioma, subclass astrocy...",0.997145,astrocytoma,/omics/odcf/analysis/OE0606_projects/pancancer...,CADDF04F-C9A1-4A84-A840-1AA1053A982D
205059630019_R05C01,24CAA1E6-EA6E-4EB5-81FB-39C2740795ED,254654,intrazerebral,86.0,121779,Adult-type diffuse gliomas,"Glioblastoma, IDH-wildtype","glioblastoma, IDH-wildtype, RTK1 type","Glioblastoma, IDH-wildtype, RTK1 subtype","Glioblastoma, IDH-wildtype, RTK1 subtype","glioblastoma, IDH-wildtype","methylation class glioblastoma, IDH wildtype, ...",0.562538,glioblastoma,/omics/odcf/analysis/OE0606_projects/pancancer...,24CAA1E6-EA6E-4EB5-81FB-39C2740795ED


In [9]:
annotated_slides = slide_meta.uuid + '.pt'

In [5]:
path_to_extracted_features = '/omics/odcf/analysis/OE0606_projects/pancancer_histopathology/analysis/shared_playground/CNS_classification/embeddings/UNI_256_1024_UKHD_FULL_dataset/pt_files/'

all_extracted_slides = os.listdir(path_to_extracted_features)

In [8]:
all_extracted_slides[0:5]

['CFACBADB-5F8E-44B2-8118-5870D150CA21.pt',
 '1B36CE7A-F577-46EB-AF2C-5B00C329C5F5.pt',
 '40F25E94-4533-4D62-B372-03ABFDEA52B4.pt',
 '51E399E7-38DB-40FB-8E27-E49F8D2C40A3.pt',
 'FF5CECEA-7D4E-425E-88E2-B042FC3846FF.pt']

In [12]:
missing_slides = [x for x in all_extracted_slides if x not in annotated_slides.tolist()]

In [13]:
len(missing_slides)

1888

In [15]:
missing_slides = [x.strip('.pt') for x in missing_slides]


In [21]:
with open('missing_slide_uuids.txt', 'w') as fl:
    for slide in missing_slides:
        fl.write(slide)
        fl.write('\n')
