In [1]:
# ENV: paimg9
import sys
import os
import numpy as np
import cv2
import openslide
from openslide import open_slide
from openslide.deepzoom import DeepZoomGenerator
import xml.etree.ElementTree as ET
from xml.dom import minidom
import geojson
import argparse
import matplotlib.pyplot as plt
import fastai
from fastai.vision.all import *
import PIL
matplotlib.use('Agg')
import pandas as pd
import datetime
from skimage import draw, measure, morphology, filters
from shapely.geometry import Polygon, Point, MultiPoint, MultiPolygon, shape
from shapely.ops import cascaded_union, unary_union
import json
import shapely
import warnings
from scipy import ndimage
sys.path.insert(0, '../Utils/')
from Utils import generate_deepzoom_tiles, extract_tile_start_end_coords
from Utils import do_mask_original,check_tissue,whitespace_check
from Utils import slide_ROIS
from Utils import get_downsample_factor, get_image_at_target_mag
from Utils import create_dir_if_not_exists
from Utils import get_map_startend
from Utils import cancer_mask_fix_res, tile_ROIS, check_any_invalid_poly, make_valid_poly
from Utils import convert_img
from Utils import plot_tiles_with_topK_cancerprob, get_binary_pred_tile
from Utils import cancer_inference_wsi , cancer_inference_tma
warnings.filterwarnings("ignore")

In [2]:
############################################################################################################
#USER INPUT 
############################################################################################################
mag_extract = 20        # do not change this, model trained at 250x250 at 20x
save_image_size = 250   # do not change this, model trained at 250x250 at 20x
pixel_overlap = 0       # specify the level of pixel overlap in your saved images
limit_bounds = True     # passed straight through to cancer_inference_wsi; keep True (presumably the DeepZoom limit_bounds flag -- TODO confirm)
smooth = True           # whether or not to gaussian smooth the output probability map
ft_model = True         # whether or not to use fine-tuned model
mag_target_prob = 2.5   # 2.5x for probability maps
mag_target_tiss = 1.25  # 1.25x for tissue detection, this is not used for TMA
bi_thres = 0.4          # binary classification threshold for cancer mask
cohort_name = "TCGA_PRAD"

############################################################################################################
#DIR
############################################################################################################
# proj_dir already ends with '/', so every sub-path below is appended WITHOUT a leading slash
# (the CCola/OPX paths previously produced '...mutation_pred//data/...').
proj_dir = '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/'
wsi_location_ccola = proj_dir + 'data/CCola/all_slides/'
wsi_location_opx = proj_dir + 'data/OPX/'
wsi_location_tan = proj_dir + 'data/TAN_TMA_Cores/'
wsi_location_tcga = proj_dir + 'data/TCGA_PRAD/'

# Sub-folder shared by the tile-pulling input and the cancer-detection output,
# e.g. 'TCGA_PRAD/IMSIZE250_OL0/'. Built once so the two locations cannot drift apart.
cohort_tile_subdir = cohort_name + "/" + "IMSIZE" + str(save_image_size) + "_OL" + str(pixel_overlap) + "/"

feature_location = proj_dir + 'intermediate_data/1_tile_pulling/' + cohort_tile_subdir #cancer_prediction_results110224
model_path = proj_dir + 'models/cancer_detection_models/mets/'

out_location = proj_dir + 'intermediate_data/2_cancer_detection/' + cohort_tile_subdir
create_dir_if_not_exists(out_location)

Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/2_cancer_detection/TCGA_PRAD/IMSIZE250_OL0/' already exists.


In [3]:
############################################################################################################
#Select IDS
############################################################################################################
#Get IDs that are in FT train or already processed to exclude 
fine_tune_ids_df = pd.read_csv(proj_dir + 'intermediate_data/0_cd_finetune/cancer_detection_training/all_tumor_fraction_info.csv')
ft_train_ids = list(fine_tune_ids_df.loc[fine_tune_ids_df['Train_OR_Test'] == 'Train','sample_id'])
toexclude_ids = ft_train_ids + ['cca3af0c-3e0e-4cfb-bb07-459c979a0bd5'] #The latter one is TCGA issue file

#All available IDs
opx_ids = [x.replace('.tif','') for x in os.listdir(wsi_location_opx)] #217
ccola_ids = [x.replace('.svs','') for x in os.listdir(wsi_location_ccola) if '(2017-0133)' in x] #234
tan_ids =  [x.replace('.tif','') for x in os.listdir(wsi_location_tan)] #677
tcga_ids = [x.replace('.svs','') for x in os.listdir(wsi_location_tcga) if x != '.DS_Store'] #449

# Map each supported cohort_name to its ID list. An unknown cohort_name now fails
# here with a clear message instead of leaving all_ids undefined (NameError below).
ids_by_cohort = {
    "OPX": opx_ids,
    "ccola": ccola_ids,
    "TAN_TMA_Cores": tan_ids,
    "TCGA_PRAD": tcga_ids,
    "all": opx_ids + ccola_ids + tan_ids + tcga_ids,
}
if cohort_name not in ids_by_cohort:
    raise ValueError("Unknown cohort_name '" + str(cohort_name) + "'; expected one of " + str(sorted(ids_by_cohort)))
all_ids = ids_by_cohort[cohort_name]

#Exclude ids in ft_train or processed
# A set gives constant-time membership tests; the resulting list is the same as before.
exclude_lookup = set(toexclude_ids)
selected_ids = sorted(x for x in all_ids if x not in exclude_lookup)

In [4]:
############################################################################################################
#START
############################################################################################################
# Run cancer detection for every selected slide and write the results to
#   <out_location>/<cur_id>/<ft_model|prior_model>/
#
# Load model
# The same learner is used for every slide, so it is loaded once here instead of once per
# iteration. (Assumes cancer_inference_wsi / cancer_inference_tma do not modify the learner
# -- TODO confirm.)
# NOTE(review): the double dot in '..._fine_tuned..pkl' is kept as-is; the recorded run
# loaded this path successfully, so it presumably matches the file on disk.
if ft_model:
    learn = load_learner(model_path + 'ft_models/dlv3_2ep_2e4_update-07182023_RT_fine_tuned..pkl',cpu=False) #all use mets model
    model_subdir = "ft_model"
else:
    learn = load_learner(model_path + 'dlv3_2ep_2e4_update-07182023_RT.pkl',cpu=False) #all use prior mets model
    model_subdir = "prior_model"

for cur_id in selected_ids:

    # os.path.join avoids the '//' that appeared because out_location already ends with '/';
    # the trailing "" keeps the trailing separator the downstream helpers are given today.
    slide_dir = os.path.join(out_location, cur_id, "")
    save_location = os.path.join(slide_dir, model_subdir, "")

    #Check if already processed
    # This must be evaluated BEFORE the output directory is created and against the directory
    # of the model actually in use. The previous check looked at '<slide>/ft_model/ft_model/',
    # which never exists, so every slide was re-processed on every run.
    # A slide counts as processed when its output directory exists and is non-empty.
    # Caveat: a run interrupted while writing results leaves a partial directory that is
    # skipped too -- delete that directory to force a re-run.
    if os.path.isdir(save_location) and len(os.listdir(save_location)) > 0:
        print("Skipping " + cur_id + ": results already present in " + save_location)
        continue

    # Resolve the slide file, the tissue radius and the name used for the saved outputs.
    # slides_name is now set in every branch: it used to exist only for TCGA slides while
    # still being passed as save_name for every non-TMA slide.
    if 'OPX' in cur_id:
        _file = wsi_location_opx + cur_id + ".tif"
        rad_tissue = 5
        slides_name = cur_id
    elif '(2017-0133)' in cur_id:
        _file = wsi_location_ccola + cur_id + '.svs'
        rad_tissue = 2
        slides_name = cur_id
    elif 'TMA' in cur_id:
        _file = wsi_location_tan + cur_id + '.tif'
        rad_tissue = 2
        slides_name = cur_id
    else:
        # TCGA: each ID is a folder holding the slide; the first listed .svs is used
        # (listing order is left untouched so the pick stays the same as before).
        tcga_slide_dir = wsi_location_tcga + cur_id + '/'
        svs_files = [f for f in os.listdir(tcga_slide_dir) if '.svs' in f]
        if not svs_files:
            raise FileNotFoundError("No .svs slide found in " + tcga_slide_dir)
        slides_name = svs_files[0].replace('.svs','')
        _file = tcga_slide_dir + slides_name + '.svs'
        rad_tissue = 2

    create_dir_if_not_exists(slide_dir)
    create_dir_if_not_exists(save_location)

    #Load tile info 
    if cohort_name == 'TCGA_PRAD':
        tile_info_df = pd.read_csv(feature_location + cur_id + "/"  + slides_name + "_tiles.csv")
    else:
        tile_info_df = pd.read_csv(feature_location + cur_id + "/"  + cur_id + "_tiles.csv")
    print(tile_info_df.shape)

    #Run
    if 'TMA' in cur_id:
        cancer_inference_tma(_file, learn, tile_info_df, save_image_size, pixel_overlap, mag_target_prob, rad_tissue, smooth, bi_thres, save_location, save_name = cur_id)
    else:
        cancer_inference_wsi(_file, learn, tile_info_df, mag_extract, save_image_size, pixel_overlap, limit_bounds, mag_target_prob, mag_target_tiss, rad_tissue, smooth, bi_thres, save_location, save_name = slides_name)


Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/2_cancer_detection/TCGA_PRAD/IMSIZE250_OL0//00784afd-6fc2-4f5e-b07e-0ebb38152339/' already exists.
Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/2_cancer_detection/TCGA_PRAD/IMSIZE250_OL0//00784afd-6fc2-4f5e-b07e-0ebb38152339/ft_model/' already exists.
(20298, 9)
True
starting inference
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
post-processing
Cancer Prob generation
Get cancer binary mask...
Output json annotation...
Plot top predicted  tiles...
Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/2_cancer_detection/TCGA_PRAD/IMSIZE250_OL0//0138c086-0d7e-4fce-a1e9-8c272173e5b1/' created.
Directory '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/intermediate_data/2_cancer_detection/T

KeyboardInterrupt: 