In [1]:
#NOTE: use paimg9 env
import sys
import os
import numpy as np
import openslide
import pandas as pd
import warnings
import torch
import torch.nn as nn

sys.path.insert(0, '../Utils/')
from Utils import create_dir_if_not_exists, count_label, set_seed
from train_utils import ModelReadyData_diffdim
warnings.filterwarnings("ignore")

In [2]:
def combine_feature_label_tumorinfo_tma(patient_id, feature_path, tumor_info_path, input_file_name, selected_labels):
    """Build one tile-level table (features + labels + tumor info) for a sample.

    Reads ``<feature_path>/<patient_id>/features/<input_file_name>.h5`` (keys
    ``'feature'`` and ``'tile_info'``) and
    ``<tumor_info_path>/<patient_id>/ft_model/<patient_id>_TILE_TUMOR_PERC.csv``,
    then joins them so that every output row describes exactly one tile.

    Args:
        patient_id: Sample folder name; also the prefix of the tumor CSV.
        feature_path: Directory that contains one sub-folder per sample.
        tumor_info_path: Directory that contains the per-sample tumor CSVs.
        input_file_name: HDF5 file name without the ``.h5`` extension.
        selected_labels: Label columns the caller expects; any that are absent
            from the tile-info table are added and filled with NaN.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index holding the feature columns
        (names cast to ``str``) followed by the tile-info/label columns and the
        columns contributed by the tumor CSV.

    Raises:
        ValueError: if the feature table and the tile-info table have a
            different number of rows (they cannot be paired by position).
    """
    # Columns that identify a tile in both the tile-info table and the tumor CSV.
    # NOTE(review): WHITE_SPACE / TISSUE_COVERAGE look like float columns; an
    # exact-equality join on floats read back from CSV can drop rows - confirm.
    tile_key_cols = ['SAMPLE_ID', 'MAG_EXTRACT', 'SAVE_IMAGE_SIZE', 'PIXEL_OVERLAP',
                     'LIMIT_BOUNDS', 'TILE_XY_INDEXES', 'TILE_COOR_ATLV0', 'WHITE_SPACE',
                     'TISSUE_COVERAGE']

    #Input file
    input_file = os.path.join(feature_path, patient_id, 'features', input_file_name + '.h5')

    #feature (positional index so it can be paired with tile_info row by row)
    feature_df = pd.read_hdf(input_file, key='feature').reset_index(drop=True)
    feature_df.columns = feature_df.columns.astype(str)

    #Label
    label_df = pd.read_hdf(input_file, key='tile_info').reset_index(drop=True)
    if len(feature_df) != len(label_df):
        raise ValueError(
            "Feature/tile_info row count mismatch for '{}': {} vs {}".format(
                patient_id, len(feature_df), len(label_df)))

    #add lacking labels as nan to fit the input format in the model
    for label in selected_labels:
        if label not in label_df.columns:
            label_df[label] = np.nan

    # Remember which feature row each tile came from: the inner merge below may
    # drop (or duplicate) rows, and a plain index-based concat would then attach
    # features to the wrong tile.
    row_key = '__feature_row__'
    label_df[row_key] = np.arange(len(label_df))

    #Add tumor info to label
    tumor_info_df = pd.read_csv(os.path.join(tumor_info_path, patient_id, "ft_model/", patient_id + "_TILE_TUMOR_PERC.csv"))
    merged_df = label_df.merge(tumor_info_df, on=tile_key_cols)

    # Pick the feature rows that survived the merge, in merged order.
    aligned_feature_df = feature_df.iloc[merged_df[row_key].to_numpy()].reset_index(drop=True)
    merged_df = merged_df.drop(columns=row_key).reset_index(drop=True)

    #Combine feature and label and tumor info
    comb_df = pd.concat([aligned_feature_df, merged_df], axis=1)

    return comb_df

def extract_feature_label_tumorinfo_np_tma(selected_df, selected_feature, selected_labels):
    """Split a combined tile table into model inputs.

    Args:
        selected_df: Tile-level DataFrame containing the feature columns, the
            label columns and the tile-info columns listed below.
        selected_feature: Column names used as the feature matrix.
        selected_labels: Column names used as labels.

    Returns:
        A 4-tuple ``(features, labels, tile_info, tumor_fraction)``:
        features is the array of ``selected_feature`` columns (one row per
        tile); labels is the de-duplicated ``selected_labels`` rows cast to
        float32; tile_info is a DataFrame (not an array) of the tile metadata
        columns; tumor_fraction is the ``TUMOR_PIXEL_PERC`` column's values.
    """
    # Tile metadata columns handed back to the caller alongside the arrays.
    tile_info_columns = [
        'SAMPLE_ID', 'MAG_EXTRACT', 'SAVE_IMAGE_SIZE', 'PIXEL_OVERLAP',
        'LIMIT_BOUNDS', 'TILE_XY_INDEXES', 'TILE_COOR_ATLV0', 'WHITE_SPACE',
        'TISSUE_COVERAGE', 'SITE_LOCAL', 'pred_map_location', 'TUMOR_PIXEL_PERC',
    ]

    # One row per tile.
    features = selected_df[selected_feature].values

    # Identical label rows are collapsed before conversion to float32.
    distinct_label_rows = selected_df[selected_labels].drop_duplicates()
    labels = distinct_label_rows.values.astype('float32')

    tile_info = selected_df[tile_info_columns]
    tumor_fraction = selected_df['TUMOR_PIXEL_PERC'].values

    return features, labels, tile_info, tumor_fraction


def get_feature_label_array_dynamic_tma(feature_path, tumor_info_path, feature_name, selected_ids,selected_labels, selected_feature, tumor_fraction_thres = 0):
    
    feature_list = []
    label_list = []
    info_list = []
    tumor_info_list = []
    id_list = []
    ct = 0 
    for pt in selected_ids:
        if ct % 100 == 0 : print(ct)
    
        #Combined feature label and tumor info
        cur_comb_df = combine_feature_label_tumorinfo_tma(pt, feature_path, tumor_info_path, feature_name, selected_labels)
        
        #Select tumor fraction > X tiles
        cur_comb_df_tumor = cur_comb_df.loc[cur_comb_df['TUMOR_PIXEL_PERC'] > tumor_fraction_thres].copy()
        cur_comb_df_tumor = cur_comb_df_tumor.sort_values(by = ['TUMOR_PIXEL_PERC'], ascending = False) 
        cur_n_tumor_tiles = cur_comb_df_tumor.shape[0] # N of tumor tiles
    
        if tumor_fraction_thres == 0: #select all tiles
            cur_selected_df =  cur_comb_df 
        elif tumor_fraction_thres > 0: #select tumor tiles based on the threshold
            cur_selected_df =  cur_comb_df_tumor 
        cur_selected_df = cur_selected_df.reset_index(drop = True)
    
        if cur_selected_df is not None :
            #Extract feature, label and tumor info
            cur_feature, cur_label, cur_info, cur_tf_info =  extract_feature_label_tumorinfo_np_tma(cur_selected_df, selected_feature, selected_labels)
            feature_list.append(cur_feature)
            label_list.append(cur_label)
            info_list.append(cur_info)
            tumor_info_list.append(cur_tf_info)
            id_list.append(pt)
            ct += 1
        
    return feature_list, label_list, info_list, tumor_info_list, id_list

In [3]:
# ------------------------------------------------------------------
# User input: labels, features and tiling parameters
# ------------------------------------------------------------------
# Label columns requested from each sample's tile-info table.
SELECTED_LABEL = [
    "AR",
    "MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2",
    "PTEN",
    "RB1",
    "TP53",
    "TMB_HIGHorINTERMEDITATE",
    "MSI_POS",
]
# Feature columns "0".."2047" plus the per-tile tumor fraction.
SELECTED_FEATURE = list(map(str, range(2048))) + ['TUMOR_PIXEL_PERC']
TUMOR_FRAC_THRES = 0.0  # 0 -> keep every tile
pixel_overlap = 0
save_image_size = 250
cohort_name = "TAN_TMA_Cores"  #TAN_TMA_Cores
feature_extraction_method = 'retccl'
folder_name = f"{cohort_name}/IMSIZE{save_image_size}_OL{pixel_overlap}/"

# ------------------------------------------------------------------
# Input directories
# ------------------------------------------------------------------
proj_dir = '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/'
feature_path = f"{proj_dir}intermediate_data/4_tile_feature/{folder_name}"
tumor_info_path = f"{proj_dir}intermediate_data/2_cancer_detection/{folder_name}"

# ------------------------------------------------------------------
# Output directory (one per feature extractor / tumor-fraction threshold)
# ------------------------------------------------------------------
outdir = os.path.join(
    f"{proj_dir}intermediate_data/5_model_ready_data",
    folder_name,
    f"feature_{feature_extraction_method}",
    f"TFT{TUMOR_FRAC_THRES}",
)
create_dir_if_not_exists(outdir)

# ------------------------------------------------------------------
# Device selection and seeding
# ------------------------------------------------------------------
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)
set_seed(0)  # project helper from Utils; presumably seeds the RNGs - confirm there

Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/5_model_ready_data/TAN_TMA_Cores/IMSIZE250_OL0/feature_retccl/TFT0.0' already exists.
cuda:0


In [4]:
# One sub-directory of feature_path per TMA core. os.listdir() returns entries
# in arbitrary order and also lists stray files, so keep directories only and
# sort them to make the order of the saved lists reproducible between runs.
tma_ids = sorted(
    entry for entry in os.listdir(feature_path)
    if os.path.isdir(os.path.join(feature_path, entry))
)
len(tma_ids)

677

In [5]:
# ------------------------------------------------------------------
# Load features, labels, tile info and tumor fractions for every TMA core
# NOTE (kept from the original cell): "OPX_005 has no tumor tiles in fold0
# train, so excluded in this step" - this names an OPX sample; TODO confirm
# whether it still applies to the TMA cohort processed here.
# ------------------------------------------------------------------
feature_name = f"features_alltiles_{feature_extraction_method}"
feature, label, info, tf_info, select_val_ids = get_feature_label_array_dynamic_tma(
    feature_path=feature_path,
    tumor_info_path=tumor_info_path,
    feature_name=feature_name,
    selected_ids=tma_ids,
    selected_labels=SELECTED_LABEL,
    selected_feature=SELECTED_FEATURE,
    tumor_fraction_thres=TUMOR_FRAC_THRES,
)

0
100
200
300
400
500
600


In [6]:
# Persist each per-core list next to the others in outdir
# (file names are kept exactly as written by the original cell).
torch.save(feature, f"{outdir}/tma_feature.pth")
torch.save(label, f"{outdir}/tma_label.pth")
torch.save(info, f"{outdir}/tma_info_.pth")
torch.save(tf_info, f"{outdir}/tma_tfinfo.pth")
torch.save(select_val_ids, f"{outdir}/tma_ids.pth")

In [7]:
############################################################################################################
#Count Distribution
############################################################################################################
val_counts = count_label(label, SELECTED_LABEL, "TMA")
val_counts.to_csv(outdir + '/tma_counts.csv')
# Keep the table as the cell's last expression so the notebook actually
# renders it; a bare name in the middle of a cell is evaluated and discarded.
val_counts

In [8]:
# Wrap the per-core lists in the project's ModelReadyData_diffdim container
# (tumor-fraction and cluster inputs switched off) and save it with the rest.
val_data = ModelReadyData_diffdim(
    feature,
    label,
    tf_info,
    include_tumor_fraction=False,
    include_cluster=False,
)
torch.save(val_data, f"{outdir}/tma_data.pth")