In [1]:
#NOTE: use paimg9 env
import sys
import os
import numpy as np
import openslide
import pandas as pd
import warnings
import torch
import torch.nn as nn

sys.path.insert(0, '../Utils/')
from Utils import create_dir_if_not_exists, count_label, set_seed
from train_utils import ModelReadyData_diffdim, get_feature_label_array_dynamic
warnings.filterwarnings("ignore")

In [2]:
####################################
######      USERINPUT       ########
####################################
# Label column names handed to get_feature_label_array_dynamic.
# NOTE(review): the recorded run of that call raised a KeyError saying the MMR, TMB and MSI
# columns are "not in index" -- confirm these names exist for the selected cohort.
SELECTED_LABEL = ["AR","MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2","PTEN","RB1","TP53","TMB_HIGHorINTERMEDITATE","MSI_POS"]
# Tiles are kept when TUMOR_PIXEL_PERC >= this threshold; it is also part of the output dir name.
TUMOR_FRAC_THRES = 0.9
# Tiling parameters; in this notebook they are only used to build folder_name.
pixel_overlap = 0
save_image_size = 250
cohort_name = "OPX"
feature_extraction_method = 'uni2' #retccl, uni1, uni2
# Relative sub-folder shared by the input and output locations; keeps its trailing "/".
folder_name = cohort_name + "/" + "IMSIZE" + str(save_image_size) + "_OL" + str(pixel_overlap) + "/"

# Number of embedding columns produced by each supported feature extractor.
FEATURE_DIM_BY_METHOD = {'retccl': 2048, 'uni1': 1024, 'uni2': 1536}
if feature_extraction_method not in FEATURE_DIM_BY_METHOD:
    # Fail here instead of with a NameError on SELECTED_FEATURE further down.
    raise ValueError(
        f"Unknown feature_extraction_method {feature_extraction_method!r}; "
        f"expected one of {sorted(FEATURE_DIM_BY_METHOD)}"
    )

# Feature columns are named "0" .. "<dim-1>", followed by the tumor-fraction column.
SELECTED_FEATURE = [str(i) for i in range(FEATURE_DIM_BY_METHOD[feature_extraction_method])] + ['TUMOR_PIXEL_PERC']

##################
###### DIR  ######
##################
# Input locations are built by plain concatenation, so both end with the
# trailing "/" carried by folder_name.
proj_dir = '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/'
tumor_info_path = proj_dir + 'intermediate_data/2_cancer_detection/' + folder_name
feature_path = proj_dir + 'intermediate_data/4_tile_feature/' + folder_name

################################################
#Create output dir
################################################
# Output layout: 5_model_ready_data/<folder_name>/feature_<method>/TFT<threshold>
outdir_parts = [
    proj_dir + 'intermediate_data/5_model_ready_data',
    folder_name,
    'feature_' + feature_extraction_method,
    'TFT' + str(TUMOR_FRAC_THRES),
]
outdir = os.path.join(*outdir_parts)
create_dir_if_not_exists(outdir)

##################
#Select GPU
##################
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)
# Project helper called with seed 0 (presumably seeds the RNGs -- confirm in Utils).
set_seed(0)

Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/5_model_ready_data/TAN_TMA_Cores/IMSIZE250_OL0/feature_uni2/TFT0.9' already exists.
cuda:0


In [3]:
############################################################################################################
#Select OPX IDS
############################################################################################################
#All available IDs: one entry per item in the feature directory, '.DS_Store' skipped
#and any '.tif' in the entry name removed.
opx_ids = sorted(
    entry.replace('.tif','')
    for entry in os.listdir(feature_path)
    if entry != '.DS_Store'
) #210 , excluded 7 fine-tune cases

#Check cancer detection
# Columns shared by the tile-info table and the per-tile tumor table; used as the join key.
tile_merge_keys = ['SAMPLE_ID', 'MAG_EXTRACT', 'SAVE_IMAGE_SIZE', 'PIXEL_OVERLAP',
                   'LIMIT_BOUNDS', 'TILE_XY_INDEXES', 'TILE_COOR_ATLV0', 'WHITE_SPACE',
                   'TISSUE_COVERAGE']

all_tile_info_list = []
for pt in opx_ids:
    # Tile metadata stored alongside the extracted features of this sample.
    feature_file = feature_path + pt + '/features/' + 'features_alltiles_' +  feature_extraction_method + '.h5'
    cur_tile_info = pd.read_hdf(feature_file, key='tile_info')

    # Per-tile tumor information written by the cancer-detection step.
    tumor_file = os.path.join(tumor_info_path, pt, "ft_model/", pt + "_TILE_TUMOR_PERC.csv")
    cur_tumor_info_df = pd.read_csv(tumor_file)

    # Default (inner) merge: only tiles present in both tables are kept.
    cur_tile_info = cur_tile_info.merge(cur_tumor_info_df, on = tile_merge_keys)
    all_tile_info_list.append(cur_tile_info)

# Stack the per-sample tables into one frame covering every sample.
all_tile_info = pd.concat(all_tile_info_list)

# Keep only tiles whose tumor fraction reaches the configured threshold.
all_tile_info_thres = all_tile_info.loc[all_tile_info['TUMOR_PIXEL_PERC'] >= TUMOR_FRAC_THRES]

# Build the set of samples with at least one qualifying tile ONCE; rebuilding it inside the
# comprehension would rescan the whole tile table for every candidate ID.
cancer_ids = set(all_tile_info_thres['SAMPLE_ID'])
nocancer_ids = [x for x in opx_ids if x not in cancer_ids]
print("No Cancer IDs",nocancer_ids)

#Excluded 2 colon cases: OPX_085, OPX_182
toexclude_ids = nocancer_ids + ['OPX_085','OPX_182']  #25

#Exclude ids in ft_train or processed
# Set membership keeps this filter linear in the number of IDs; order of opx_ids is preserved.
toexclude_lookup = set(toexclude_ids)
selected_ids = [x for x in opx_ids if x not in toexclude_lookup] #199
print(len(selected_ids))

No Cancer IDs ['TMA97A-1-11', 'TMA97A-1-12', 'TMA97A-1-13', 'TMA97A-1-3', 'TMA97A-1-6', 'TMA97A-2-2', 'TMA97A-2-3', 'TMA97A-2-5', 'TMA97A-3-10', 'TMA97A-4-12', 'TMA97A-4-7', 'TMA97A-5-10', 'TMA97A-5-13', 'TMA97A-5-3', 'TMA97A-5-7', 'TMA97A-6-1', 'TMA97A-6-3', 'TMA97A-6-4', 'TMA97A-6-6', 'TMA97A-7-13', 'TMA97A-7-2', 'TMA97A-7-8', 'TMA97A-8-11', 'TMA97A-8-4', 'TMA97B-1-12', 'TMA97B-1-13', 'TMA97B-1-2', 'TMA97B-1-9', 'TMA97B-2-1', 'TMA97B-3-13', 'TMA97B-3-2', 'TMA97B-3-3', 'TMA97B-3-9', 'TMA97B-4-10', 'TMA97B-4-11', 'TMA97B-5-4', 'TMA97B-6-11', 'TMA97B-6-6', 'TMA97B-6-7', 'TMA97B-7-5', 'TMA97B-7-6', 'TMA97B-7-9', 'TMA97B-8-10', 'TMA97B-8-11', 'TMA97B-8-13', 'TMA97B-9-11', 'TMA97B-9-12', 'TMA97B-9-2', 'TMA97B-9-9', 'TMA97C-1-10', 'TMA97C-1-11', 'TMA97C-1-12', 'TMA97C-1-13', 'TMA97C-1-8', 'TMA97C-1-9', 'TMA97C-2-1', 'TMA97C-2-10', 'TMA97C-2-13', 'TMA97C-2-2', 'TMA97C-2-3', 'TMA97C-2-5', 'TMA97C-2-6', 'TMA97C-2-8', 'TMA97C-2-9', 'TMA97C-3-1', 'TMA97C-3-12', 'TMA97C-3-13', 'TMA97C-3-2', 'TMA9

In [4]:
############################################################################################################
#Get features and labels
#NOTE: OPX_005 has no tumor tiles in fold0 train, so excluded in this step
############################################################################################################
# Base name of the per-sample feature file (without extension).
feature_name = f"features_alltiles_{feature_extraction_method}"
# The helper returns five objects; select_val_ids presumably lists the IDs that were
# actually retained -- confirm in train_utils.
(feature, label, info, tf_info, select_val_ids) = get_feature_label_array_dynamic(
    feature_path,
    tumor_info_path,
    feature_name,
    selected_ids,
    SELECTED_LABEL,
    SELECTED_FEATURE,
    tumor_fraction_thres = TUMOR_FRAC_THRES,
)

0


KeyError: "['MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2', 'TMB_HIGHorINTERMEDITATE', 'MSI_POS'] not in index"

In [None]:
# Persist every intermediate object into outdir, in the same order as before.
artifacts = [
    (feature,        '/OPX_feature.pth'),
    (label,          '/OPX_label.pth'),
    (info,           '/OPX_info.pth'),
    (tf_info,        '/OPX_tfinfo.pth'),
    (select_val_ids, '/OPX_ids.pth'),
]
for artifact, file_suffix in artifacts:
    torch.save(artifact, outdir + file_suffix)

In [None]:
#TODO: Double-check these two counts; they do not match, and the reason is not yet known.
# Tile table after thresholding.
thres_shape = all_tile_info_thres.shape
print(thres_shape) #930297
# Per-sample info arrays returned by get_feature_label_array_dynamic, stacked together.
check = np.concatenate(info) #927717
check.shape

In [None]:
############################################################################################################
#Count Distribution
############################################################################################################
# Destination of the label-count table inside the output directory.
counts_csv = outdir + '/OPX_counts.csv'
# Project helper; the result exposes .to_csv, so presumably a pandas object -- confirm in Utils.
counts = count_label(label, SELECTED_LABEL, "OPX")
print(counts)
counts.to_csv(counts_csv)

In [None]:
#Get model ready data
# Options forwarded to the dataset wrapper: both optional inputs are switched off
# and the feature extractor name is passed through.
dataset_options = dict(
    include_tumor_fraction = False,
    include_cluster = False,
    feature_name = feature_extraction_method,
)
data = ModelReadyData_diffdim(feature, label, tf_info, **dataset_options)
# Store the assembled dataset object alongside the other outputs.
torch.save(data, outdir + '/OPX_data.pth')

In [None]:
# Sanity check: shape of the first element of the first dataset item.
first_item = data[0]
first_item[0].shape