In [1]:
import numpy as np
import sklearn
import pandas as pd
import h5py
import glob

import matplotlib.pyplot as plt

from skimage.transform import resize, pyramid_gaussian
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

import zarr

import os
import sys
import re

import openslide

from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay



In [2]:
# Load the CT label table; later cells rely on its TCGA_Case and CT_Status columns.
ct_scoring_csv = "/home/p163v/MethCTScoring/snp6_CT_labels_11_1MB_10.csv"
ct_scoring = pd.read_csv(ct_scoring_csv)


In [3]:
# Harmonise CT_Status for every TCGA case that occurs on more than one row.
# All rows of such a case receive one consensus value, chosen by priority:
#   "Chromothripsis" > "Unsure" > "No Chromothripsis"
# NOTE: the final fallback also overwrites any status value other than these three.
duplicated_cases = ct_scoring.loc[ct_scoring.TCGA_Case.duplicated()].TCGA_Case
for case in duplicated_cases.unique():  # visit each repeated case once
    case_rows = ct_scoring.TCGA_Case == case
    case_statuses = set(ct_scoring.loc[case_rows, "CT_Status"])
    if "Chromothripsis" in case_statuses:
        consensus = "Chromothripsis"
    elif "Unsure" in case_statuses:
        consensus = "Unsure"
    else:
        consensus = "No Chromothripsis"
    ct_scoring.loc[case_rows, "CT_Status"] = consensus


In [4]:
# Keep only the first row of each TCGA case (original index labels are retained).
ct_scoring = ct_scoring.drop_duplicates(subset="TCGA_Case", keep="first")

In [5]:
# Derive one project code per slide-list file: take the file name, keep the part
# before the first "_", then the second "-"-separated field of that prefix.
tcga_projects = []
for slide_list_path in glob.glob("../metadata/TCGA/slides_list/*_primary.txt"):
    file_prefix = os.path.basename(slide_list_path).split("_")[0]
    tcga_projects.append(file_prefix.split("-")[1])

In [6]:
# Display the project codes parsed from the slide-list file names.
tcga_projects

['COAD',
 'HNSC',
 'MESO',
 'SKCM',
 'PAAD',
 'TGCT',
 'CESC',
 'CHOL',
 'OV',
 'GBM',
 'LGG',
 'LUAD',
 'THCA',
 'SARC',
 'DLBC',
 'BRCA',
 'THYM',
 'ACC',
 'KIRP',
 'KIRC',
 'PCPG',
 'UCEC',
 'READ',
 'UVM',
 'LIHC',
 'BLCA',
 'KICH',
 'UCS',
 'ESCA',
 'STAD',
 'LUSC',
 'PRAD']

In [7]:
# Read every per-project slide list as a headerless single-purpose table.
slide_list_paths = glob.glob("../metadata/TCGA/slides_list/*_primary.txt")

# `tcga_projects` was built from a separate glob call and is zipped with these
# tables afterwards. glob does not guarantee a stable order, so confirm that both
# listings line up instead of silently attaching the wrong project to a table.
projects_from_paths = [
    os.path.basename(path).split("_")[0].split("-")[1] for path in slide_list_paths
]
if projects_from_paths != tcga_projects:
    raise RuntimeError(
        "Slide-list files are not in the same order as tcga_projects; "
        "re-run the cell that builds tcga_projects before this one."
    )

tcga_file_lists = [pd.read_csv(path, header=None) for path in slide_list_paths]

In [8]:
# Tag every slide-list table with the project code at the same list position.
for slide_list, project_code in zip(tcga_file_lists, tcga_projects):
    slide_list["project"] = project_code


In [9]:
# Stack all per-project tables row-wise; each table keeps its own row labels,
# so index values repeat across projects.
tcga_slide_map = pd.concat(tcga_file_lists, axis=0, ignore_index=False)

In [10]:
# PatientID = the first three "-"-separated fields of the slide file name
# (column 0), re-joined with "-".
patient_ids = []
for slide_file in tcga_slide_map.loc[:, 0]:
    barcode_fields = slide_file.split("-")
    patient_ids.append("-".join(barcode_fields[0:3]))
tcga_slide_map.loc[:, "PatientID"] = patient_ids

In [11]:
# slide_id = the slide file name (column 0) up to, but not including, its first ".".
tcga_slide_map.loc[:, "slide_id"] = [
    slide_file.partition(".")[0] for slide_file in tcga_slide_map.loc[:, 0]
]

In [12]:
# Display the combined slide table (file name, project, PatientID, slide_id).
tcga_slide_map

Unnamed: 0,0,project,PatientID,slide_id
0,TCGA-3L-AA1B-01Z-00-DX1.8923A151-A690-40B7-9E5...,COAD,TCGA-3L-AA1B,TCGA-3L-AA1B-01Z-00-DX1
1,TCGA-3L-AA1B-01Z-00-DX2.17CE3683-F4B1-4978-A28...,COAD,TCGA-3L-AA1B,TCGA-3L-AA1B-01Z-00-DX2
2,TCGA-4N-A93T-01Z-00-DX1.82E240B1-22C3-46E3-891...,COAD,TCGA-4N-A93T,TCGA-4N-A93T-01Z-00-DX1
3,TCGA-4N-A93T-01Z-00-DX2.875E7F95-A6D4-4BEB-A33...,COAD,TCGA-4N-A93T,TCGA-4N-A93T-01Z-00-DX2
4,TCGA-4T-AA8H-01Z-00-DX1.A46C759C-74A2-4724-B6B...,COAD,TCGA-4T-AA8H,TCGA-4T-AA8H-01Z-00-DX1
...,...,...,...,...
425,TCGA-YL-A9WJ-01Z-00-DX1.ECE31D79-8A8E-45DB-8BC...,PRAD,TCGA-YL-A9WJ,TCGA-YL-A9WJ-01Z-00-DX1
426,TCGA-YL-A9WK-01Z-00-DX1.8541045E-7FD4-4E74-BA1...,PRAD,TCGA-YL-A9WK,TCGA-YL-A9WK-01Z-00-DX1
427,TCGA-YL-A9WL-01Z-00-DX1.4EE10C9A-18EA-4DB0-BE2...,PRAD,TCGA-YL-A9WL,TCGA-YL-A9WL-01Z-00-DX1
428,TCGA-YL-A9WX-01Z-00-DX1.20B40B9A-C210-419D-B9A...,PRAD,TCGA-YL-A9WX,TCGA-YL-A9WX-01Z-00-DX1


In [13]:
# Spot-check: show the row(s) for one specific slide id.
tcga_slide_map[tcga_slide_map["slide_id"] == "TCGA-06-0152-01Z-00-DX6"]


Unnamed: 0,0,project,PatientID,slide_id
141,TCGA-06-0152-01Z-00-DX6.b20146eb-b6a1-4f9f-842...,GBM,TCGA-06-0152,TCGA-06-0152-01Z-00-DX6


In [14]:
# Join case-level CT calls with slide rows on the patient barcode. The outer join
# keeps cases without slides and slides without calls (with NaN on the missing side).
slide_annots = pd.merge(
    left=ct_scoring,
    right=tcga_slide_map,
    left_on="TCGA_Case",
    right_on="PatientID",
    how="outer",
)

In [15]:
# Display the slide_id column of the merged (outer-joined) table.
slide_annots.slide_id

0        TCGA-DC-4745-01Z-00-DX1
1        TCGA-EW-A1OV-01Z-00-DX1
2        TCGA-02-0003-01Z-00-DX1
3        TCGA-02-0003-01Z-00-DX2
4        TCGA-02-0003-01Z-00-DX3
                  ...           
12955    TCGA-HC-7749-01Z-00-DX1
12956    TCGA-HC-8212-01Z-00-DX1
12957    TCGA-KK-A5A1-01Z-00-DX1
12958    TCGA-YL-A8HO-01Z-00-DX1
12959    TCGA-YL-A9WK-01Z-00-DX1
Name: slide_id, Length: 12960, dtype: object

In [16]:
# Discard every row that has a missing value in any column.
slide_annots = slide_annots.dropna(axis="index", how="any")

In [None]:
# Directory holding one "<slide_id>.h5" feature file per slide.
path_to_extracted_features = '/home/p163v/histopathology/TCGA/ffpe/299/'


# For each slide, record the first dimension of its 'feats' dataset
# (0 when no feature file exists for that slide).
patch_num = []
for slide_id in slide_annots.slide_id:
    feature_file = path_to_extracted_features + slide_id + ".h5"
    if os.path.exists(feature_file):
        # Open read-only and close the handle right away; leaving it open would
        # leak one file descriptor per slide across the whole cohort.
        with h5py.File(feature_file, "r") as h5_file:
            patch_num.append(h5_file['feats'].shape[0])
    else:
        patch_num.append(0)

In [None]:
# Attach the per-slide counts collected in patch_num (same row order as slide_annots).
slide_annots = slide_annots.assign(patches=patch_num)

In [None]:
# Display the annotation table, now including the 'patches' column.
slide_annots

In [None]:
# Keep only slides with a positive patch count (missing feature files were recorded as 0).
has_patches = slide_annots["patches"] > 0
slide_annots = slide_annots.loc[has_patches]

In [None]:
# Exclude slides whose case-level call is "Unsure".
is_confident_call = slide_annots["CT_Status"] != "Unsure"
slide_annots = slide_annots.loc[is_confident_call]

In [None]:
# Number of distinct TCGA cases remaining, shown as a 1-tuple.
(len(slide_annots["TCGA_Case"].unique()),)

In [None]:
# Binary target: 1 = "Chromothripsis", 0 = "No Chromothripsis".
# The mapping is spelled out explicitly so it does not depend on which categories
# happen to be present, and any other status is rejected instead of being
# silently encoded.
unexpected_statuses = set(slide_annots.CT_Status.unique()) - {"Chromothripsis", "No Chromothripsis"}
if unexpected_statuses:
    raise ValueError("Unexpected CT_Status values: " + repr(sorted(map(str, unexpected_statuses))))
# assign() returns a new frame, avoiding a write into the filtered slice.
slide_annots = slide_annots.assign(
    labels=np.where(slide_annots.CT_Status == "Chromothripsis", 1, 0)
)

In [None]:
# One row per distinct (case, label) pair, renumbered 0..n-1 so that positional
# indices can be used to address patients.
unique_patient_labels = (
    slide_annots[["TCGA_Case", "labels"]]
    .drop_duplicates(ignore_index=True)
)
unique_patient_labels

In [None]:
# Seed NumPy's global RNG for anything downstream that draws from it.
np.random.seed(42)


# 5-fold stratified partition at patient level (no shuffle argument is passed).
# Each entry of kfold_test_splits is a (train_positions, test_positions) pair.
kfold = StratifiedKFold(n_splits=5)
patient_ids = np.array(unique_patient_labels.TCGA_Case)
patient_labels = np.array(unique_patient_labels.labels)
kfold_test_splits = list(kfold.split(patient_ids, patient_labels))




In [None]:
# For each fold, split the non-test patients 75/25 into train and validation and
# append the fold's test positions, giving [train, valid, test] per fold.
# The positions index unique_patient_labels (one row per patient), so the
# stratification labels must come from that same table, not from the
# slide-level slide_annots, whose rows are in a different, longer order.
labels_by_patient = np.array(unique_patient_labels.labels)
kfold_train_valid_test_splits = [
    train_test_split(
        train_valid_positions,
        train_size=0.75,
        stratify=labels_by_patient[train_valid_positions],
        random_state=42,  # explicit seed: re-running this cell reproduces the split
    )
    + [test_positions]
    for train_valid_positions, test_positions in kfold_test_splits
]




In [None]:
# Write train/valid/test slide tables for each of the 5 folds into
# <splits_root>/<fold>/{train,valid,test}_set.csv.
splits_root = '/omics/odcf/analysis/OE0585_projects/chromothripsis/histopathology/splits/TCGA/18032024/'
for i in range(5):
    print('starting: '+str(i))
    fold_dir = splits_root + str(i)
    os.makedirs(fold_dir, exist_ok=True)

    # Expand each patient subset to all of that patient's slides (left join on TCGA_Case).
    train_table, valid_table, test_table = (
        pd.merge(unique_patient_labels.loc[positions, "TCGA_Case"], slide_annots, how="left")
        for positions in kfold_train_valid_test_splits[i]
    )

    for table, csv_name in (
        (train_table, '/train_set.csv'),
        (valid_table, '/valid_set.csv'),
        (test_table, '/test_set.csv'),
    ):
        # Path of the slide's feature file, built the same way as when counting patches.
        table['features'] = path_to_extracted_features + table.slide_id + ".h5"
        table.to_csv(fold_dir + csv_name)




In [None]:
# Persist the filtered, labelled slide-level table.
labeled_data_csv = '/omics/odcf/analysis/OE0585_projects/chromothripsis/histopathology/metadata/tcga_labeled_data.csv'
slide_annots.to_csv(labeled_data_csv)

In [None]:
# Display the final slide-level annotation table that was written to disk.
slide_annots