In [1]:
#!/usr/bin/env python
# coding: utf-8
#NOTE: use paimg9 env

import sys
import os
import numpy as np
import openslide
import torch
import pandas as pd
import warnings
import time
import PIL
sys.path.insert(0, '../Utils/')
from Utils import create_dir_if_not_exists
from FeatureExtractor import PretrainedModelLoader, TileEmbeddingExtractor
warnings.filterwarnings("ignore")

In [2]:
############################################################################################################
#USER INPUT 
############################################################################################################
pixel_overlap = 0      # specify the level of pixel overlap in your saved images
save_image_size = 250  # size tag of the saved tiles; only used here to build folder_name
cohort_name = "OPX"  #TAN_TMA_Cores, OPX, TCGA_PRAD
feature_extraction_method = 'uni2' #retccl, uni1, uni2, prov_gigapath
folder_name = "IMSIZE" + str(save_image_size) + "_OL" + str(pixel_overlap)

############################################################################################################
#DIR
############################################################################################################
proj_dir = '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/'

# Slide directories. The final '' component makes os.path.join emit a trailing
# separator, which later cells rely on when they build file names by plain
# string concatenation (e.g. wsi_location_opx + cur_id + ".tif").
# Using os.path.join also avoids the doubled '//' that appeared when a
# '/data/...' suffix was appended to proj_dir (which already ends in '/').
wsi_location_opx = os.path.join(proj_dir, 'data', 'OPX', '')
wsi_location_tan = os.path.join(proj_dir, 'data', 'TAN_TMA_Cores', '')
wsi_location_ccola = os.path.join(proj_dir, 'data', 'CCola', 'all_slides', '')
wsi_location_tcga = os.path.join(proj_dir, 'data', 'TCGA_PRAD', '')

# Per-cohort tile info directory and the directory of the pretrained model
info_path  = os.path.join(proj_dir, 'intermediate_data', '2_cancer_detection', cohort_name)
model_path = os.path.join(proj_dir, 'models', 'feature_extraction_models', feature_extraction_method)

# Output root for this cohort / tile configuration
out_location = os.path.join(proj_dir, 'intermediate_data', '4_tile_feature', cohort_name, folder_name)
create_dir_if_not_exists(out_location)

##################
#Select GPU
##################
# Fall back to CPU when no CUDA device is visible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/4_tile_feature/OPX/IMSIZE250_OL0' already exists.


In [3]:
############################################################################################################
#Select IDS
############################################################################################################
#Get IDs that are in FT train or already processed to exclude 
fine_tune_ids_df = pd.read_csv(os.path.join(proj_dir, 'intermediate_data', '0_cd_finetune', 'cancer_detection_training', 'all_tumor_fraction_info.csv'))
ft_train_ids = list(fine_tune_ids_df.loc[fine_tune_ids_df['Train_OR_Test'] == 'Train','sample_id']) #24, 7 from OPX, 17 from ccola
toexclude_ids = ft_train_ids + ['cca3af0c-3e0e-4cfb-bb07-459c979a0bd5'] #The latter one is TCGA issue file

#All available IDs (directory entries with the slide extension stripped)
opx_ids = [x.replace('.tif','') for x in os.listdir(wsi_location_opx)] #353
ccola_ids = [x.replace('.svs','') for x in os.listdir(wsi_location_ccola) if '(2017-0133)' in x] #234
tan_ids =  [x.replace('.tif','') for x in os.listdir(wsi_location_tan)] #677
tcga_ids = [x.replace('.svs','') for x in os.listdir(wsi_location_tcga) if x != '.DS_Store'] #449

#Map every supported cohort_name to its candidate IDs
ids_by_cohort = {
    "OPX": opx_ids,
    "ccola": ccola_ids,
    "TAN_TMA_Cores": tan_ids,
    "TCGA_PRAD": tcga_ids,
    "all": opx_ids + ccola_ids + tan_ids + tcga_ids,
}

#Fail loudly on an unknown cohort instead of leaving all_ids undefined
#(or silently reusing a stale all_ids from a previous run of this cell)
if cohort_name not in ids_by_cohort:
    raise ValueError(
        "Unknown cohort_name %r; expected one of %s" % (cohort_name, sorted(ids_by_cohort))
    )
all_ids = ids_by_cohort[cohort_name]

#Exclude ids in ft_train or processed (set lookup avoids a list scan per id)
excluded_lookup = set(toexclude_ids)
selected_ids = sorted(x for x in all_ids if x not in excluded_lookup)
print(len(selected_ids))

353


In [None]:
###########################################################################################################
# Load Pretrained representation model
###########################################################################################################
# Use the device selected in the configuration cell rather than a hard-coded
# 'cuda', so the loader follows the same CPU fallback as the rest of the notebook.
# device.type is the plain string ('cuda' or 'cpu'), matching the string that was
# passed here before.
modelloader = PretrainedModelLoader(feature_extraction_method, model_path, device=device.type)
model = modelloader.model

In [37]:
# Half-open index window [start, end) used to slice selected_ids for this run
select_idx_start, select_idx_end = 200, 353

In [38]:
# Show the IDs that fall inside the chosen index window
selected_ids[slice(select_idx_start, select_idx_end)]

['OPX_207',
 'OPX_208',
 'OPX_209',
 'OPX_210',
 'OPX_211',
 'OPX_212',
 'OPX_213',
 'OPX_214',
 'OPX_215',
 'OPX_216',
 'OPX_217',
 'OPX_218',
 'OPX_219',
 'OPX_220',
 'OPX_221',
 'OPX_222',
 'OPX_223',
 'OPX_224',
 'OPX_225',
 'OPX_226',
 'OPX_227',
 'OPX_228',
 'OPX_229',
 'OPX_230',
 'OPX_231',
 'OPX_232',
 'OPX_233',
 'OPX_234',
 'OPX_235',
 'OPX_236',
 'OPX_237',
 'OPX_238',
 'OPX_239',
 'OPX_240',
 'OPX_241',
 'OPX_242',
 'OPX_243',
 'OPX_244',
 'OPX_245',
 'OPX_246',
 'OPX_247',
 'OPX_248',
 'OPX_249',
 'OPX_250',
 'OPX_251',
 'OPX_252',
 'OPX_253',
 'OPX_254',
 'OPX_255',
 'OPX_256',
 'OPX_257',
 'OPX_258',
 'OPX_259',
 'OPX_260',
 'OPX_261',
 'OPX_262',
 'OPX_263',
 'OPX_264',
 'OPX_265',
 'OPX_266',
 'OPX_267',
 'OPX_268',
 'OPX_269',
 'OPX_270',
 'OPX_271',
 'OPX_272',
 'OPX_273',
 'OPX_274',
 'OPX_275',
 'OPX_276',
 'OPX_277',
 'OPX_278',
 'OPX_279',
 'OPX_280',
 'OPX_281',
 'OPX_282',
 'OPX_283',
 'OPX_284',
 'OPX_285',
 'OPX_286',
 'OPX_287',
 'OPX_288',
 'OPX_289',
 'OP

In [None]:
############################################################################################################
#For each patient tile, get representation
############################################################################################################
# image_type passed to TileEmbeddingExtractor for each cohort this cell can handle.
# 'WSI' slides are opened with openslide, 'TMA' images with PIL.
image_type_by_cohort = {
    "OPX": 'WSI',
    "TCGA_PRAD": 'WSI',
    "TAN_TMA_Cores": 'TMA',
}

# Fail before the loop for cohorts that have no extractor setup (e.g. "ccola", "all").
# Previously such cohorts fell through every branch, so embed_extractor was either
# undefined (NameError) or silently left over from an earlier slide / earlier run.
if cohort_name not in image_type_by_cohort:
    raise ValueError(
        "Feature extraction is not set up for cohort_name %r; supported: %s"
        % (cohort_name, sorted(image_type_by_cohort))
    )
image_type = image_type_by_cohort[cohort_name]

ct = 0  # number of slides whose features were extracted during this run
for cur_id in selected_ids[select_idx_start:select_idx_end]:
    if ct % 10 == 0: print(ct)

    save_location = os.path.join(out_location, cur_id , 'features')
    create_dir_if_not_exists(save_location)
    save_name = os.path.join(save_location, 'features_alltiles_' + feature_extraction_method + '.h5')

    #check if processed
    if os.path.exists(save_name):
        print('Already Processed:',cur_id)
        continue

    #Resolve the slide name and the image file for this ID
    if cohort_name == "OPX":
        slides_name = cur_id
        _file = wsi_location_opx + slides_name + ".tif"
    elif cohort_name == "TAN_TMA_Cores":
        slides_name = cur_id
        _file = wsi_location_tan + slides_name + '.tif'
    else:  # 'TCGA_PRAD': each ID is a directory holding the .svs slide
        tcga_dir = wsi_location_tcga + cur_id + '/'
        svs_files = [f for f in os.listdir(tcga_dir) if '.svs' in f]
        if not svs_files:
            raise FileNotFoundError("No .svs file found in " + tcga_dir)
        slides_name = svs_files[0].replace('.svs','')
        _file = tcga_dir + slides_name + '.svs'

    #Get tile info
    # NOTE(review): the sub-folder used to be hard-coded as 'IMSIZE250_OL0'; it now follows
    # folder_name (identical for the current settings) - confirm the cancer-detection
    # outputs use the same folder naming for other tile sizes / overlaps.
    cur_tile_info_df = pd.read_csv(os.path.join(info_path, folder_name, cur_id ,'ft_model', slides_name + "_TILE_TUMOR_PERC.csv"))
    n_tiles = cur_tile_info_df.shape[0]
    print('NOT Processed:',cur_id, "N Tiles:", str(n_tiles))

    # np.concatenate raises on an empty list, so skip slides without tiles
    if n_tiles == 0:
        print('No tiles, skipped:', cur_id)
        continue

    #Load slides, and Construct embedding extractor
    if image_type == 'WSI':
        slide_handle = openslide.OpenSlide(_file)
    else:
        slide_handle = PIL.Image.open(_file)

    try:
        embed_extractor = TileEmbeddingExtractor(cur_tile_info_df, slide_handle, feature_extraction_method, model, device, image_type = image_type)

        #Get feature
        start_time = time.time()
        feature_list = [embed_extractor[i][1] for i in range(n_tiles)]
        print("--- %s seconds ---" % (time.time() - start_time))
        feature_df = np.concatenate(feature_list)
        feature_df = pd.DataFrame(feature_df)
    finally:
        # Release the file handle even if extraction fails part-way through
        slide_handle.close()

    # NOTE(review): saving is currently disabled, so features are computed and then
    # discarded, and the "already processed" check above can never become true from
    # this cell alone. Re-enable the two lines below to persist results.
    # feature_df.to_hdf(save_name, key='feature', mode='w')
    # cur_tile_info_df.to_hdf(save_name, key='tile_info', mode='a')

    ct += 1