In [1]:
#NOTE: use paimg9 env
import sys
import os
import numpy as np
import openslide
import pandas as pd
import warnings
import torch
import torch.nn as nn

# Put the project's helper directory first on the import path so the two
# project modules below resolve. The path is relative, so it depends on the
# kernel's working directory.
sys.path.insert(0, '../Utils/')
from Utils import create_dir_if_not_exists, count_label, set_seed
from train_utils import ModelReadyData_diffdim, get_feature_label_array_dynamic_TCGA
# NOTE(review): this silences every warning for the rest of the session,
# including ones raised by pandas/torch in the cells below.
warnings.filterwarnings("ignore")

In [2]:
####################################
######      USERINPUT       ########
####################################
# Labels to build targets for; names must match the keys produced when the
# mutation-call files are loaded.
SELECTED_LABEL = ["AR","MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2","PTEN","RB1","TP53","TMB_HIGHorINTERMEDITATE","MSI_POS"]
# Minimum TUMOR_PIXEL_PERC a tile must reach to be kept.
TUMOR_FRAC_THRES = 0.9
# Tiling parameters; here they are only used to locate the matching data folder.
pixel_overlap = 0
save_image_size = 250
cohort_name = "TCGA_PRAD"
feature_extraction_method = 'uni2'  # one of: 'retccl', 'uni1', 'uni2'
folder_name = cohort_name + "/" + "IMSIZE" + str(save_image_size) + "_OL" + str(pixel_overlap) + "/"

# Number of embedding columns written by each supported feature extractor.
FEATURE_DIM_BY_METHOD = {'retccl': 2048, 'uni1': 1024, 'uni2': 1536}

# Fail fast on an unsupported extractor instead of leaving SELECTED_FEATURE
# undefined (or stale from an earlier run in the same kernel).
if feature_extraction_method not in FEATURE_DIM_BY_METHOD:
    raise ValueError(
        "Unknown feature_extraction_method '" + str(feature_extraction_method) + "'; "
        "expected one of " + str(sorted(FEATURE_DIM_BY_METHOD))
    )

# Embedding columns are named "0".."dim-1"; the tumor fraction is appended last.
SELECTED_FEATURE = [str(i) for i in range(FEATURE_DIM_BY_METHOD[feature_extraction_method])] + ['TUMOR_PIXEL_PERC']

##################
###### DIR  ######
##################
# Root of the project tree; every location below is derived from it.
proj_dir = '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/'
intermediate_dir = f'{proj_dir}intermediate_data/'

# Inputs: per-sample tile features, per-sample tumor-detection results, and
# the cohort's mutation-call tables. The first two keep folder_name's trailing
# slash because later cells append sample IDs by string concatenation.
feature_path = f'{intermediate_dir}4_tile_feature/{folder_name}'
tumor_info_path = f'{intermediate_dir}2_cancer_detection/{folder_name}'
mutation_path = os.path.join(proj_dir, 'data', 'MutationCalls', cohort_name)

# Output folder, keyed by cohort/tiling, feature extractor and tumor-fraction
# threshold.
outdir = os.path.join(
    f'{intermediate_dir}5_model_ready_data',
    folder_name,
    f'feature_{feature_extraction_method}',
    f'TFT{TUMOR_FRAC_THRES}',
)
create_dir_if_not_exists(outdir)

##################
#Select GPU
##################
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)
# Project helper called with seed 0 (presumably seeds the RNGs in use — see Utils.set_seed).
set_seed(0)

Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/5_model_ready_data/TCGA_PRAD/IMSIZE250_OL0/feature_uni2/TFT0.9' already exists.
cuda:0


In [3]:
############################################################################################################
#Get Mutation label
############################################################################################################
# Some mutation-call files are named after a gene list rather than the label
# used in SELECTED_LABEL; this table translates those file suffixes.
# NOTE(review): the first entry maps an ATM/ATR/BRCA1/BRCA2/FANCD2/NBN gene
# list onto a label whose name lists MMR genes — confirm this is intended.
LABEL_RENAME = {
    '(ATM, ATR, BRCA1, BRCA2, FANCD2, NBN)': 'MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2',
    '(MLH1, MSH2, MSH6, PMS2)': 'MSI_POS',
}

# Only real .tsv files are considered (a substring test would also match names
# such as "x.tsv.bak"). Sorting makes the result independent of the arbitrary
# order returned by os.listdir, which matters if two files map to one label.
mutation_files = sorted(x for x in os.listdir(mutation_path) if x.endswith('.tsv'))

# label name -> list of patient IDs listed in that label's file
all_label_dict = {}
for f in mutation_files:
    # The label is whatever follows the last "_" in the file name.
    # NOTE(review): a label that itself contains "_" (e.g.
    # "TMB_HIGHorINTERMEDITATE") cannot be recovered this way unless it is
    # renamed above; the recorded label counts show NaN for that label — verify
    # against the actual file names.
    raw_label = f.split('_')[-1].split('.tsv')[0]
    cur_label = LABEL_RENAME.get(raw_label, raw_label)

    cur_label_df = pd.read_csv(os.path.join(mutation_path, f), sep='\t')
    all_label_dict[cur_label] = list(cur_label_df['Patient ID'])

In [4]:
############################################################################################################
#Select TCGA sample IDs
############################################################################################################
# Every entry under feature_path is treated as one sample folder
# (the macOS metadata file is skipped).
tcga_sample_ids = sorted(x for x in os.listdir(feature_path) if x != '.DS_Store')

# tcga_sample_ids = [x for x in tcga_sample_ids if x not in ['8b5e1da6-310b-4a22-b55c-61f423217681',
#                                                            'fa72ee9d-a091-498b-8511-9f4779ace490']]

# Suffix of the per-slide tumor-fraction table written by the cancer-detection step.
TUMOR_PERC_SUFFIX = '_TILE_TUMOR_PERC.csv'
# Columns shared by the tile-info table and the tumor-fraction table.
TILE_MERGE_KEYS = ['SAMPLE_ID', 'MAG_EXTRACT', 'SAVE_IMAGE_SIZE', 'PIXEL_OVERLAP',
                   'LIMIT_BOUNDS', 'TILE_XY_INDEXES', 'TILE_COOR_ATLV0', 'WHITE_SPACE',
                   'TISSUE_COVERAGE']

#Check cancer detection: attach each tile's tumor fraction to its tile info
all_tile_info_list = list()
for ct, pt in enumerate(tcga_sample_ids):
    if ct % 50 == 0: print(ct)
    cur_tile_info = pd.read_hdf(feature_path + pt + '/features/' + 'features_alltiles_' +  feature_extraction_method + '.h5', key='tile_info')

    # Pick the tumor-fraction table by its full suffix instead of "first file
    # containing .csv", so an unrelated CSV in the folder cannot be chosen and
    # a missing table yields a clear error. Sorted so the choice is
    # deterministic if several tables exist.
    ft_model_dir = os.path.join(tumor_info_path, pt, "ft_model/")
    tumor_csvs = sorted(f for f in os.listdir(ft_model_dir) if f.endswith(TUMOR_PERC_SUFFIX))
    if not tumor_csvs:
        raise FileNotFoundError("No *" + TUMOR_PERC_SUFFIX + " file found in " + ft_model_dir)
    cur_tumor_info_df = pd.read_csv(os.path.join(ft_model_dir, tumor_csvs[0]))

    # Default (inner) merge: only tiles present in both tables are kept.
    cur_tile_info = cur_tile_info.merge(cur_tumor_info_df, on = TILE_MERGE_KEYS)
    cur_tile_info['TCGA_FOLDER_ID'] = pt
    all_tile_info_list.append(cur_tile_info)

all_tile_info = pd.concat(all_tile_info_list)

# Tiles whose tumor fraction reaches the threshold
all_tile_info_thres = all_tile_info.loc[all_tile_info['TUMOR_PIXEL_PERC'] >= TUMOR_FRAC_THRES]

# Samples with no tile above the threshold. The ID set is built once rather
# than once per element of the comprehension.
ids_with_tumor_tiles = set(all_tile_info_thres['TCGA_FOLDER_ID'])
nocancer_ids = [x for x in tcga_sample_ids if x not in ids_with_tumor_tiles]
print("No Cancer IDs",nocancer_ids)

# Append any manually excluded IDs to the empty list here.
toexclude_ids = nocancer_ids + []

#Keep every sample that is not excluded
excluded_lookup = set(toexclude_ids)
selected_ids = [x for x in tcga_sample_ids if x not in excluded_lookup]
print(len(selected_ids))

0
50
100
150
200
250
300
350
400
No Cancer IDs ['2c6fbdb0-2fbb-4881-aa2e-ad3627665576']
447


In [5]:
############################################################################################################
#Get features and labels
############################################################################################################
# Base name (without extension) of each sample's feature file.
feature_name = f'features_alltiles_{feature_extraction_method}'

# Build the feature/label/info arrays for the selected samples with the
# project helper, using the same tumor-fraction threshold as above.
# (Exact structure of the five outputs is defined in train_utils — TODO confirm.)
(feature, label, info, tf_info, select_val_ids) = get_feature_label_array_dynamic_TCGA(
    feature_path,
    all_label_dict,
    tumor_info_path,
    feature_name,
    selected_ids,
    SELECTED_LABEL,
    SELECTED_FEATURE,
    tumor_fraction_thres=TUMOR_FRAC_THRES,
)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440


In [6]:
# Write each extraction output to outdir as TCGA_<stem>.pth, in the same order
# as before.
for stem, obj in (
    ('feature', feature),
    ('label', label),
    ('info', info),
    ('tfinfo', tf_info),
    ('ids', select_val_ids),
):
    torch.save(obj, outdir + '/TCGA_' + stem + '.pth')

In [7]:
# Sanity check: the thresholded tile table and the stacked per-sample info
# arrays should have the same number of rows.
# TODO: an earlier run recorded 930297 vs 927717 rows here; in the recorded
# output of this run both have 1307688 rows. Confirm why they differed.
print(all_tile_info_thres.shape)
check = np.concatenate(info, axis=0)
check.shape

(1307688, 41)


(1307688, 15)

In [8]:
############################################################################################################
#Count Distribution
############################################################################################################
# Per-label counts via the project helper.
# NOTE(review): the third argument is "OPX" although this notebook processes
# TCGA data; in the recorded output it appears as the column prefix (OPX_N0, ...).
# Left unchanged because the saved CSV's column names depend on it.
counts = count_label(label, SELECTED_LABEL, "OPX")
print(counts)
counts_csv_path = f'{outdir}/TCGA_counts.csv'
counts.to_csv(counts_csv_path)

                                                  OPX_N0  OPX_N1  OPX_Perc0  \
AR                                                 445.0     2.0       99.6   
MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2   413.0    34.0       92.4   
PTEN                                               434.0    13.0       97.1   
RB1                                                444.0     3.0       99.3   
TP53                                               394.0    53.0       88.1   
TMB_HIGHorINTERMEDITATE                              NaN     NaN        NaN   
MSI_POS                                            443.0     4.0       99.1   

                                                  OPX_Perc1  
AR                                                      0.4  
MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2        7.6  
PTEN                                                    2.9  
RB1                                                     0.7  
TP53                                                   11

In [9]:
# Wrap features, labels and tumor-fraction info in the project's dataset
# object (tumor fraction and cluster inputs switched off), then save it next
# to the other outputs.
data = ModelReadyData_diffdim(
    feature,
    label,
    tf_info,
    include_tumor_fraction=False,
    include_cluster=False,
    feature_name=feature_extraction_method,
)
torch.save(data, f'{outdir}/TCGA_data.pth')