In [1]:
#!/usr/bin/env python
# coding: utf-8
#NOTE: use paimg9 env

import sys
import os
import numpy as np
import openslide
from fastai.vision.all import *
matplotlib.use('Agg')
import pandas as pd
import warnings
sys.path.insert(0, '../Utils/')
from Preprocessing import preprocess_mutation_data, preprocess_site_data, get_tile_representation, get_tile_representation_tma
from Utils import generate_deepzoom_tiles
from Utils import create_dir_if_not_exists
warnings.filterwarnings("ignore")

import ResNet as ResNet
import time
import PIL 
import timm

In [2]:
############################################################################################################
#USER INPUT 
############################################################################################################
pixel_overlap = 0      # pixel overlap of the saved tiles (also encoded in the folder name as "_OL")
save_image_size = 250  # saved tile size (also encoded in the folder name as "IMSIZE")
limit_bounds = True     # forwarded unchanged to generate_deepzoom_tiles; do not change
cohort_name = "TAN_TMA_Cores"  #TAN_TMA_Cores
feature_extraction_method = 'uni1' #retccl, uni1, uni2
folder_name = cohort_name + "/" + "IMSIZE" + str(save_image_size) + "_OL" + str(pixel_overlap) + "/" 

# retccl is the only extractor that is run without the resize transform.
resize_transform = feature_extraction_method != 'retccl'
############################################################################################################
#DIR
############################################################################################################
# NOTE: proj_dir ends with a separator, so every sub-path below is appended WITHOUT a leading "/"
# (the previous '/data/...' spelling produced '...mutation_pred//data/...').
# All directory variables keep their trailing "/" because later cells build file names by concatenation.
proj_dir = '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/'
model_path = proj_dir + 'models/feature_extraction_models/' + feature_extraction_method + '/'

wsi_location_opx = proj_dir + 'data/OPX/'
wsi_location_tan = proj_dir + 'data/TAN_TMA_Cores/'
wsi_location_ccola = proj_dir + 'data/CCola/all_slides/'
tile_info_path = proj_dir + 'intermediate_data/3_updated_tile_info/'+ folder_name

out_location = proj_dir + 'intermediate_data/4_tile_feature/'+ folder_name
create_dir_if_not_exists(out_location)

##################
#Select GPU
##################
# Falls back to CPU when no CUDA device is visible.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/4_tile_feature/TAN_TMA_Cores/IMSIZE250_OL0/' already exists.


In [3]:
############################################################################################################
#Select IDS
############################################################################################################
#Get IDs that are in FT train or already processed to exclude 
fine_tune_ids_df = pd.read_csv(proj_dir + 'intermediate_data/0_cd_finetune/cancer_detection_training/all_tumor_fraction_info.csv')
ft_train_ids = list(fine_tune_ids_df.loc[fine_tune_ids_df['Train_OR_Test'] == 'Train','sample_id']) #24, 7 from OPX, 17 from ccola
toexclude_ids = ft_train_ids 
# Set copy used only for membership tests, so filtering is O(1) per ID instead of a list scan.
toexclude_lookup = set(toexclude_ids)

#All available IDs (file name with its extension text removed)
opx_ids = [x.replace('.tif','') for x in os.listdir(wsi_location_opx)] #217
ccola_ids = [x.replace('.svs','') for x in os.listdir(wsi_location_ccola) if '(2017-0133)' in x] #234
tan_ids =  [x.replace('.tif','') for x in os.listdir(wsi_location_tan)] #677

# Map every supported cohort name to its candidate IDs.
ids_by_cohort = {
    "OPX": opx_ids,
    "ccola": ccola_ids,
    "TAN_TMA_Cores": tan_ids,
    "all": opx_ids + ccola_ids + tan_ids,
}

# Fail loudly on a typo in cohort_name instead of leaving all_ids undefined
# (or, worse, silently reusing a value left over from an earlier run of this cell).
if cohort_name not in ids_by_cohort:
    raise ValueError(
        f"Unknown cohort_name {cohort_name!r}; expected one of {sorted(ids_by_cohort)}"
    )
all_ids = ids_by_cohort[cohort_name]

#Exclude ids in ft_train or processed
selected_ids = sorted(x for x in all_ids if x not in toexclude_lookup) #209 for 

In [4]:
################################################
#Load tile info 
################################################
# One CSV holds the tile table for the whole cohort folder; per-sample rows are selected later.
tile_info_csv = tile_info_path + "all_tile_info.csv"
tile_info_df = pd.read_csv(tile_info_csv)

In [5]:
############################################################################################################
# Load Pretrained representation model
############################################################################################################
# Builds the backbone selected by feature_extraction_method and loads its checkpoint from model_path.
# Every branch configures the network without a classification output
# (fc replaced by Identity for retccl, num_classes=0 for the timm models).
if feature_extraction_method == 'retccl':
    model = ResNet.resnet50(num_classes=128,mlp=False, two_branch=False, normlinear=True)
    pretext_model = torch.load(model_path + 'best_ckpt.pth',map_location=torch.device(device))
    # The head is swapped for Identity BEFORE the strict load, so the checkpoint keys must match
    # the head-less network.
    model.fc = nn.Identity()
    model.load_state_dict(pretext_model, strict=True)
elif feature_extraction_method == 'uni1':
    model = timm.create_model("vit_large_patch16_224",img_size = 224, patch_size=16, init_values=1e-5, num_classes=0, dynamic_img_size=True) # img_size=224, patch_size=16, 
    model.load_state_dict(torch.load(os.path.join(model_path, "vit_large_patch16_224.dinov2.uni_mass100k.bin"), map_location="cpu"), strict=True)
elif feature_extraction_method == 'uni2':
    timm_kwargs = {
       'model_name': 'vit_giant_patch14_224',
       'img_size': 224, 
       'patch_size': 14, 
       'depth': 24,
       'num_heads': 24,
       'init_values': 1e-5, 
       'embed_dim': 1536,
       'mlp_ratio': 2.66667*2,
       'num_classes': 0, 
       'no_embed_class': True,
       'mlp_layer': timm.layers.SwiGLUPacked, 
       'act_layer': torch.nn.SiLU, 
       'reg_tokens': 8, 
       'dynamic_img_size': True
      }
    model = timm.create_model(**timm_kwargs)
    model.load_state_dict(torch.load(os.path.join(model_path, "uni2-h.bin"), map_location="cpu"), strict=True)
else:
    # Without this guard a typo leaves `model` undefined, or silently reuses a model
    # left in the kernel by a previous run with a different method.
    raise ValueError(
        f"Unknown feature_extraction_method {feature_extraction_method!r}; "
        "expected 'retccl', 'uni1' or 'uni2'"
    )

# The network is only used to extract features, so switch layers such as BatchNorm/Dropout
# to inference behaviour. Idempotent if the representation helpers do the same.
# NOTE(review): the model is not moved to `device` here; whether the helpers do that is not
# visible from this notebook -- confirm.
model.eval()

In [6]:
############################################################################################################
#For each patient tile, get representation
############################################################################################################
# For every selected sample: open its image, compute one feature vector per tile listed in
# tile_info_df, and write features plus the matching tile table into one HDF5 file per sample.

# Only these two cohorts have an extraction branch below. Checking once up front avoids
# saving a stale `tile_img` from a previous run under a new sample ID.
if cohort_name not in ("OPX", "TAN_TMA_Cores"):
    raise NotImplementedError(
        f"Feature extraction is not implemented for cohort_name {cohort_name!r}"
    )

for ct, cur_id in enumerate(selected_ids):
    print(cur_id)

    if ct % 10 == 0: print(ct)

    # Resolve the image path from the ID naming pattern of each source.
    if 'OPX' in cur_id:
        _file = wsi_location_opx + cur_id + ".tif"
    elif '(2017-0133)' in cur_id:
        _file = wsi_location_ccola + cur_id + '.svs'
    elif 'TMA' in cur_id:
        _file = wsi_location_tan + cur_id + '.tif'
    else:
        # Previously `_file` silently kept the path of the preceding sample here.
        raise ValueError(f"Cannot determine the image file for sample ID {cur_id!r}")

    #Get tile info
    cur_tile_info_df = tile_info_df.loc[tile_info_df['SAMPLE_ID'] == cur_id]

    # np.concatenate raises on an empty list, so a sample without tiles is skipped explicitly.
    if cur_tile_info_df.shape[0] == 0:
        print("No tile info found for " + cur_id + ", skipping.")
        continue

    #Load slide
    if cohort_name == "OPX":
        slide_handle = openslide.OpenSlide(_file)
    else:
        slide_handle = PIL.Image.open(_file)

    # The handle is closed only after every tile has been indexed, because the timing below
    # shows the per-tile work happening when tile_img[i] is accessed.
    try:
        if cohort_name == "OPX":
            #Generate tiles
            tiles, tile_lvls, physSize, base_mag = generate_deepzoom_tiles(slide_handle, save_image_size, pixel_overlap, limit_bounds)

            #Grab tile
            tile_img = get_tile_representation(cur_tile_info_df, tiles, tile_lvls, model, resize_transform)
        else:
            #Grab tile
            tile_img = get_tile_representation_tma(cur_tile_info_df, slide_handle, model, resize_transform)

        #Get feature (element [1] of each item is the feature array)
        start_time = time.time()
        feature_list = [tile_img[i][1] for i in range(cur_tile_info_df.shape[0])]
        print("--- %s seconds ---" % (time.time() - start_time))
    finally:
        # Release the file handle even if extraction fails part-way through.
        slide_handle.close()

    feature_df = pd.DataFrame(np.concatenate(feature_list))

    # One HDF5 file per sample: key 'feature' holds the features, key 'tile_info' the tile table.
    save_location = out_location + cur_id + '/features/'
    create_dir_if_not_exists(save_location)
    save_name = save_location + 'features_alltiles_' + feature_extraction_method + '.h5'
    feature_df.to_hdf(save_name, key='feature', mode='w')
    cur_tile_info_df.to_hdf(save_name, key='tile_info', mode='a')



TMA97A-1-11
0
--- 3.3711657524108887 seconds ---
Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/4_tile_feature/TAN_TMA_Cores/IMSIZE250_OL0/TMA97A-1-11/features/' already exists.
TMA97A-1-12
--- 3.1593401432037354 seconds ---
Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/4_tile_feature/TAN_TMA_Cores/IMSIZE250_OL0/TMA97A-1-12/features/' already exists.
TMA97A-1-13
--- 4.9950339794158936 seconds ---
Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/4_tile_feature/TAN_TMA_Cores/IMSIZE250_OL0/TMA97A-1-13/features/' already exists.
TMA97A-1-2
--- 13.836332321166992 seconds ---
Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/4_tile_feature/TAN_TMA_Cores/IMSIZE250_OL0/TMA97A-1-2/features/' already exists.
TMA97A-1-3
--- 11.709094047546387 seconds ---
Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/4_tile_feature/TAN_TMA_Cores/IMSIZE250_OL0/TMA97A-1-3/feat

KeyboardInterrupt: 