In [1]:
#!/usr/bin/env python
# coding: utf-8
#NOTE: use paimg9 env
#Setup cell: imports, plotting backend, local helper modules and warning policy.

import sys
import os
import numpy as np
import openslide
#NOTE(review): `matplotlib` (used on the next line) and `torch` (used when selecting the device) are
#never imported explicitly in this cell; presumably both are re-exported by this star import - confirm.
from fastai.vision.all import *
matplotlib.use('Agg') #non-interactive backend: figures are rendered off-screen, no display required
import pandas as pd
import warnings
#Make the project helper modules importable. The path is relative, so it is resolved against the
#current working directory: the notebook must be started from a folder where '../Utils/' exists.
sys.path.insert(0, '../Utils/')
from Preprocessing import preprocess_mutation_data, preprocess_site_data
from Utils import create_dir_if_not_exists
warnings.filterwarnings("ignore") #silences ALL warnings for the remainder of the session

In [2]:
############################################################################################################
#USER INPUT
############################################################################################################
pixel_overlap = 100      # specify the level of pixel overlap in your saved images
save_image_size = 250    # tile size used when the images were saved (encoded in the folder name below)
cohort_name = "OPX"  #TAN_TMA_Cores
feature_extraction_method = 'retccl'
#Sub-folder shared by the input and output locations, e.g. "OPX/IMSIZE250_OL100/"
folder_name = cohort_name + "/" + "IMSIZE" + str(save_image_size) + "_OL" + str(pixel_overlap) + "/"

############################################################################################################
#DIR
############################################################################################################
#proj_dir already ends with "/", so every suffix below starts WITHOUT a leading slash
#(the OPX and CCola locations previously produced ".../mutation_pred//data/...").
proj_dir = '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/'
wsi_location_opx = proj_dir + 'data/OPX/'
wsi_location_tan = proj_dir + 'data/TAN_TMA_Cores/'
wsi_location_ccola = proj_dir + 'data/CCola/all_slides/'
label_path = proj_dir + 'data/MutationCalls/' + cohort_name + '/'
tile_info_path =  proj_dir + 'intermediate_data/1_tile_pulling_and_tissue_detect/' + folder_name #Old in cancer_prediction_results110224
out_location = proj_dir + 'intermediate_data/3_updated_tile_info/'+ folder_name
create_dir_if_not_exists(out_location)

##################
#Select GPU
##################
#Use CUDA when available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/3_updated_tile_info/OPX/IMSIZE250_OL100/' already exists.


In [3]:
############################################################################################################
#Select IDS
############################################################################################################
#Get IDs that are in FT train or already processed to exclude
fine_tune_ids_df = pd.read_csv(proj_dir + 'intermediate_data/0_cd_finetune/cancer_detection_training/all_tumor_fraction_info.csv')
ft_train_ids = list(fine_tune_ids_df.loc[fine_tune_ids_df['Train_OR_Test'] == 'Train','sample_id']) #24, 7 from OPX, 17 from ccola
toexclude_ids = ft_train_ids

#All available IDs (file name with the slide extension removed)
#NOTE: all three folders are listed regardless of cohort_name, so each of them must exist.
opx_ids = [x.replace('.tif','') for x in os.listdir(wsi_location_opx)] #217
ccola_ids = [x.replace('.svs','') for x in os.listdir(wsi_location_ccola) if '(2017-0133)' in x] #234
tan_ids =  [x.replace('.tif','') for x in os.listdir(wsi_location_tan)] #677

if cohort_name == "OPX":
    all_ids = opx_ids
elif cohort_name == "ccola":
    all_ids = ccola_ids
elif cohort_name == "TAN_TMA_Cores":
    all_ids = tan_ids
elif cohort_name == "all":
    all_ids = opx_ids + ccola_ids + tan_ids
else:
    #Fail loudly: otherwise all_ids is undefined (or, worse, a stale value left in the kernel is reused)
    raise ValueError("Unknown cohort_name '" + str(cohort_name) + "'; expected 'OPX', 'ccola', 'TAN_TMA_Cores' or 'all'")

#Exclude ids in ft_train or processed
#A set makes each membership test O(1) instead of scanning the exclusion list for every id
toexclude_set = set(toexclude_ids)
selected_ids = sorted(x for x in all_ids if x not in toexclude_set) #presumably 210 for OPX (217 slides - 7 FT-train slides)

In [4]:
############################################################################################################
#Build the per-cohort annotation tables (each one is keyed by SAMPLE_ID when merged below)
#   OPX           : mutation labels + anatomic site
#   TAN_TMA_Cores : mutation labels (site is recoded into the label table itself)
############################################################################################################
if cohort_name == "OPX":
    ################################################
    #Load OPX mutation label data
    ################################################
    #Old Data
    label_df1 = pd.read_excel(label_path + "OPX_FH_original.xlsx")

    #newly added data
    label_df2 = pd.read_excel(label_path + "MMR_OPX_deidentified.xlsx")
    #Presumably needed so the new sheet exposes the same column names as the old one (required by the
    #column selection below).
    #NOTE(review): the source header lists HR/DDR genes while the target name is an MMR column - confirm
    #that this mapping is intended.
    label_df2.rename(columns = {'HR/DDR (BRCA1, BRCA2, ATM, CHEK2, PALB2, BAP1, BARD1, RAD51C, RAD51D, FANCA, FANCD2, MRE11A, ATR, NBN, FANCM, FANCG)':
                             'MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2'}, inplace = True)
    label_df2 = label_df2.loc[label_df2['OPX_Number'].notna()] #remove rows without an OPX number
    label_df2 = label_df2[label_df1.columns] #only keep the same columns as old data

    #Combined
    label_df = pd.concat([label_df1, label_df2])
    label_df = preprocess_mutation_data(label_df)
    label_df.reset_index(drop=True, inplace=True)

    ################################################
    #Load OPX Site data
    ################################################
    #Old data, #New data has no site info
    site_df = pd.read_excel(label_path + "OPX_anatomic sites.xlsx")
    site_df.reset_index(drop=True, inplace=True)
    site_df = preprocess_site_data(site_df)

    #Merged onto the tiles in this order: labels first, then site
    annotation_dfs = [label_df, site_df]

elif cohort_name == 'TAN_TMA_Cores':
    ################################################
    #Load TAN_TMA mutation label data
    ################################################
    label_df1 = pd.read_excel(label_path + "TAN97_core_mappings.xlsx") #These Ids not in label_df2: ['18-018', '18-087', '18-064', '18-077', '08-016', '06-131']
    label_df1.rename(columns = {'AR': 'AR_inMappingFile'}, inplace = True) #avoid a clash with the 'AR' column created below
    #Missing AR/NE staining calls are coded as 0
    label_df1.loc[pd.isna(label_df1['AR pos']),'AR pos'] = 0
    label_df1.loc[pd.isna(label_df1['NE pos']),'NE pos'] = 0

    label_df2 = pd.read_excel(label_path + "TAN_coded mutation_for Roman.xlsx")
    #Rename as OPX annotation
    label_df2.rename(columns = {'AR coded': 'AR',
                               'CHD1 coded': 'CHD1',
                               'PTEN coded': 'PTEN',
                               'RB1 coded': 'RB1',
                               'TP53 coded': 'TP53',
                               'BRCA2 coded':'BRCA2'}, inplace = True)

    #Combine
    #Only keep the ids present in "TAN_coded mutation_for Roman.xlsx" (right join): for the other ids no
    #mutation labels are available, so they cannot be assumed to be negative
    label_df = label_df1.merge(label_df2, left_on = ['ptid'], right_on = ['Sample'], how = 'right')
    label_df.reset_index(drop=True, inplace=True)

    #Sample IDs whose AR status differs between the two files (40 when this was last checked)
    #NOTE(review): NaN != NaN evaluates to True, so rows missing AR in both files are also reported here
    checkAR = label_df.loc[label_df['AR pos'] != label_df['AR']]
    print(len(set(checkAR['Sample'])))
    checkAR.to_csv(out_location + "AR_notmatch.csv", index = False)

    #Recode SITE info: 1 = prostate, 0 = everything else
    #NOTE(review): rows with a missing 'ORGAN SITE' also end up as 0 - confirm this is intended
    #Kept as explicit .loc assignments on purpose: the column dtype influences how values are written to CSV
    label_df['SITE_LOCAL'] = pd.NA
    cond = label_df['ORGAN SITE'] == 'PROSTATE'
    label_df.loc[cond,'SITE_LOCAL'] = 1
    label_df.loc[~cond,'SITE_LOCAL'] = 0

    label_df.rename(columns = {'TMA-row-col': 'SAMPLE_ID'}, inplace= True)

    annotation_dfs = [label_df]

else:
    #Other cohorts can pass ID selection but have no label handling here. Without this guard the output
    #cell would either hit a NameError or silently write a stale all_tile_info_df left in the kernel.
    raise NotImplementedError("Label/site merging is only implemented for 'OPX' and 'TAN_TMA_Cores', got '" + str(cohort_name) + "'")

############################################################################################################
#Add site and label info into tile info (shared by all cohorts)
############################################################################################################
if len(selected_ids) == 0:
    raise ValueError("No sample IDs selected for cohort '" + str(cohort_name) + "'")

tile_info_list = []
for cur_id in selected_ids:
    #One tile table per sample: <tile_info_path>/<id>/<id>_tiles.csv
    cur_tile_info_df = pd.read_csv(os.path.join(tile_info_path, cur_id, cur_id + "_tiles.csv"))
    cur_comb_df = cur_tile_info_df
    for cur_annot_df in annotation_dfs:
        #Left join keeps every tile, including samples without a label/site entry
        cur_comb_df = cur_comb_df.merge(cur_annot_df, on = ['SAMPLE_ID'], how = 'left')
    tile_info_list.append(cur_comb_df)
all_tile_info_df = pd.concat(tile_info_list)
print(all_tile_info_df.shape) #OPX: 1308050 tiles overlap0, 3633199 tiles overlap100; TAN: 146888 tiles overlap0

#Print stats (reference values from earlier runs)
#   OPX overlap100: 210 IDs, max 96406, min 108, median 4376.5
#   TAN overlap0  : max 311, min 5, median 233.0
tile_counts = all_tile_info_df['SAMPLE_ID'].value_counts()
print("Total " + cohort_name + " IDs in tile path: ", len(set(all_tile_info_df['SAMPLE_ID'])))
print("Max # tile/per pt:", tile_counts.max())
print("Min # tile/per pt:", tile_counts.min())
print("Median # tile/per pt:", tile_counts.median())

(3633199, 38)
Total OPX IDs in tile path:  210
Max # tile/per pt: 96406
Min # tile/per pt: 108
Median # tile/per pt: 4376.5


In [5]:
#Output: write the combined tile table (tile info + labels/site) without the DataFrame index
all_tile_info_csv = out_location + "all_tile_info.csv"
all_tile_info_df.to_csv(all_tile_info_csv, index = False)