In [1]:
import pandas as pd
import openpyxl
import h5py
import cv2
import numpy as np
import zarr
import os
import sys
sys.path.append('C:/Users/w37262do/Documents/git/PyIR/src')
import spectral_preprocessing as sp
from matplotlib import pyplot as plt

## Define utility functions

In [2]:
def load_from_master(sheet_name, he_base=r'D:/PcrUK/H_E_segmented',
                     hdf5_base=r'D:/PcrUK/QCL_data', old_base=r"E:\Bruker Data"):
    """Read the master annotation sheet and derive the per-core file paths.

    Rows whose ``QCL_grid_number`` equals -1 are dropped before any other
    column is read. Only the columns that contribute to the returned values
    are accessed, so the sheet does not need to carry any other columns.

    Parameters
    ----------
    sheet_name
        Path of the Excel workbook; handed straight to ``pandas.read_excel``.
    he_base
        Root directory used as the prefix of every H&E path.
    hdf5_base
        Directory substituted for ``old_base`` inside each ``File_loc_QCL`` entry.
    old_base
        Prefix of the ``File_loc_QCL`` entries that gets replaced by ``hdf5_base``.

    Returns
    -------
    tuple
        ``(he_filepaths, hdf5_filepaths, t_or_n, slides, p_ids, c_ids_qcl,
        path_infos, annot_filepaths)`` -- eight numpy arrays with one entry per
        retained sheet row.
    """
    master = pd.read_excel(sheet_name)
    master = master[master['QCL_grid_number'] != -1]

    t_or_n = master['T or N'].to_numpy()
    slides = master['Slide'].to_numpy()
    p_ids = master['Patient ID'].to_numpy()
    c_ids_qcl = master['QCL_grid_number'].to_numpy()
    path_infos = master['path_info'].to_numpy().astype(str)

    # 'Core Coor' entries are split into first character and remainder, e.g. 'A12' -> 'A-12'.
    coords = master['Core Coor'].to_numpy()
    he_filepaths = np.array(
        [f'{he_base}/Slide {slide}/{coord[0]}-{coord[1:]}' for coord, slide in zip(coords, slides)])

    # Re-root the recorded acquisition location and point at the per-core zarr store.
    # Note that every resulting path ends with a trailing '/'.
    recorded_locations = master['File_loc_QCL'].to_numpy()
    hdf5_filepaths = np.array(
        [f'{location.replace(old_base, hdf5_base)}/zarr_data/core {grid}/'
         for location, grid in zip(recorded_locations, c_ids_qcl)])

    # Annotation images are addressed relative to whatever precedes "Isolated_Cores" in the path.
    annot_filepaths = np.array(
        [f'{path.split("Isolated_Cores")[0]}annotations/Slide {slide}/grid {grid} anno.png'
         for path, slide, grid in zip(hdf5_filepaths, slides, c_ids_qcl)])

    return he_filepaths, hdf5_filepaths, t_or_n, slides, p_ids, c_ids_qcl, path_infos, annot_filepaths
    
def process_core_from_path(filepath, transform):
    """Load one core from a zarr store and run it through ``transform``.

    The store is expected to expose ``'qcl_data'`` (spatial axes first,
    spectral axis last) and ``'wavenumbers'``. The cube is flattened to
    ``(pixels, bands)`` together with an all-ones mask, handed to
    ``transform(core, wavenumbers, mask)``, and the returned core and mask are
    folded back onto the original spatial shape.

    Returns ``(core, wavenumbers, mask, shape)`` where ``shape`` is the spatial
    shape of the stored cube as a numpy array.
    """
    store = zarr.open(filepath)
    cube = store['qcl_data'][:]
    # Wavenumbers are given a leading singleton axis before being passed to the transform.
    wavenumbers = store['wavenumbers'][:][np.newaxis, :]

    shape = np.array(cube.shape[:-1])
    pixel_count = np.prod(shape)

    flat_core = cube.reshape(pixel_count, -1)
    flat_mask = np.ones(shape).flatten()

    flat_core, wavenumbers, flat_mask = transform(flat_core, wavenumbers, flat_mask)

    core = flat_core.reshape(*shape, -1)
    mask = flat_mask.reshape(*shape)
    return core, wavenumbers, mask, shape

## Define dataset parameters

In [3]:
# Spatial size (in pixels) that every core is cropped/padded to; tw/th are short aliases.
target_height = 360
target_width = target_height
tw, th = target_width, target_height

# Master annotation sheet (an earlier alternative was r'../lbp_3d/data/master_linked.xlsx').
sheet_path = r'D:/PcrUK/QCL_data/annotations/annotation_directory_QCL.xlsx'
(
    he_filepaths,
    hdf5_filepaths,
    t_or_n,
    slides,
    p_ids,
    c_ids,
    path_infos,
    annot_filepaths,
) = load_from_master(sheet_path)
print(f"Dataset size: {len(he_filepaths)} items")

# Spectral preprocessing applied to every core when it is loaded.
tclass = sp.QCL2024()
transform = tclass.trans_func
print(f"Using transformation: {tclass.name}")

# Annotation colours, one row per class. A ninth colour [127,0,0] was left
# disabled in the original definition.
annotation_class_colors = np.array([
    [0, 255, 0],
    [128, 0, 128],
    [255, 0, 255],
    [0, 0, 255],
    [255, 165, 0],
    [255, 0, 0],
    [0, 255, 255],
    [255, 255, 0],
])
# Class names; presumably paired with the colour rows above by position -- TODO confirm.
annotation_class_names = np.array([
    'epithelium_n',
    'stroma_n',
    'epithelium_c',
    'stroma_c',
    'corpora_amylacea',
    'blood',
    "crushed",
    "immune_infiltrate",
])

Dataset size: 230 items
Using transformation: QCL2024


## Creation loop

In [4]:
root_dir = r'D:/datasets/pcuk2023_qcl_whole_core'

# Make sure every output folder exists before writing: h5py cannot create a
# file inside a missing directory, and a failed cv2.imwrite does not raise.
for output_subdir in ('spectral', 'chemical', 'annotation', 'mask'):
    os.makedirs(f'{root_dir}/{output_subdir}', exist_ok=True)

# Column-wise record of every core that is written; becomes the new master sheet.
new_sheet_data = {
    'slide':[],
    'core_id':[],
    'patient_id':[],
    't_or_n':[],
    'pathology_info':[],
    'annotated_pixels':[],
    'hdf5_filepath':[],
    'annotation_filepath':[],
    'chemical_image_filepath':[],
    'mask_filepath':[],
}

# Only the first 6 rows of annotation_class_colors are counted as annotated tissue.
# NOTE(review): 8 colours are defined in the parameters cell -- confirm that
# ignoring the last two is intended.
n_annotation_classes = 6

n_cores = len(hdf5_filepaths)
for core_idx in range(n_cores):
    # get data from master sheet
    hdf5_path = hdf5_filepaths[core_idx]
    # The paths end with '/', so strip it before taking the last component;
    # otherwise the reported core name is always an empty string.
    core_name = hdf5_path.rstrip('/').split('/')[-1]
    print(f"Processing cores, on core: {core_idx+1}/{n_cores}, {core_name}",end="")
    annot_path = annot_filepaths[core_idx]
    # NOTE(review): the annotation paths built by load_from_master contain
    # "annotations/" directly after the path prefix; if that prefix does not end
    # in a space this replace is a no-op and chemical_path equals annot_path -- confirm.
    chemical_path = annot_filepaths[core_idx].replace(" annotations"," chemical")
    tn = t_or_n[core_idx]
    pathology = path_infos[core_idx]
    s = slides[core_idx]
    p = p_ids[core_idx]
    c = c_ids[core_idx]

    # Load data from disk
    core,wavenumbers,mask,shape = process_core_from_path(hdf5_path,transform=transform)
    annotations = cv2.imread(annot_path)
    chemical = cv2.imread(chemical_path)
    # cv2.imread returns None instead of raising when a file cannot be read.
    if core is None or annotations is None or chemical is None:
        print(" skipping, missing chemical img or annotations")
        continue
    chemical = chemical[:,:,1]          # keep a single channel (index 1) of the chemical image
    annotations = annotations[:,:,::-1] # reverse the channel order (cv2 loads BGR) to match the colour table

    # One boolean plane per class: True where the pixel colour equals the class colour exactly.
    annotation_mask = np.zeros((*shape,n_annotation_classes))
    for tissue_class in range(n_annotation_classes):
        annotation_mask[:,:,tissue_class] = np.all(annotations == annotation_class_colors[tissue_class].reshape(1,1,-1),axis=-1)
    annotated_pixels = annotation_mask.sum()
    if annotated_pixels == 0:
        print(" skipping, no annotated pixels")
        continue

    # pad/transform data
    # Cores larger than the target keep their last target_height rows / target_width columns.
    core_height,core_width,_ = core.shape
    if core_height > target_height:
        row_start = core_height - target_height
        core = core[row_start:]
        chemical = chemical[row_start:]
        annotations = annotations[row_start:]
        mask = mask[row_start:]
    if core_width > target_width:
        col_start = core_width - target_width
        core = core[:,col_start:]
        chemical = chemical[:,col_start:]
        annotations = annotations[:,col_start:]
        mask = mask[:,col_start:]
    # Smaller cores are centred; an odd remainder goes to the bottom/right side.
    core_height,core_width,_ = core.shape
    top_pad = (target_height - core_height)//2
    bot_pad = (target_height - core_height)//2 + (target_height - core_height)%2
    left_pad = (target_width - core_width)//2
    right_pad = (target_width - core_width)//2 + (target_width - core_width)%2
    # Spectra are edge-replicated; mask, annotations and chemical image are zero-padded.
    core = np.pad(core,((top_pad,bot_pad),(left_pad,right_pad),(0,0)),'edge')
    mask = np.pad(mask,((top_pad,bot_pad),(left_pad,right_pad)),'constant',constant_values=0)
    annotations = np.pad(annotations,((top_pad,bot_pad),(left_pad,right_pad),(0,0)),'constant',constant_values=0)
    chemical = np.pad(chemical,((top_pad,bot_pad),(left_pad,right_pad)),'constant',constant_values=0)

    # save data to disk
    file_stem = f's{s:0>{2}}_c{c:0>{3}}'
    spectral_savepath = f'{root_dir}/spectral/{file_stem}.h5'
    chemical_savepath = f'{root_dir}/chemical/{file_stem}.png'
    annotation_savepath =f'{root_dir}/annotation/{file_stem}.png'
    mask_savepath =f'{root_dir}/mask/{file_stem}.png'

    # The context manager closes the file even if a dataset write raises.
    with h5py.File(spectral_savepath, 'w') as hf:
        hf.create_dataset('spectra', data=core.astype(np.float32), compression='lzf',chunks=(13,13,193))
        hf.create_dataset('mask', data=mask.astype(np.float32), compression='lzf',chunks=(13,13))
    cv2.imwrite(chemical_savepath, chemical.astype(np.uint8))
    # Reverse the channels again so the PNG is written in cv2's BGR order.
    cv2.imwrite(annotation_savepath, annotations.astype(np.uint8)[:,:,::-1])
    cv2.imwrite(mask_savepath, mask.astype(np.uint8)*255)

    # append to mega lists (one entry per column, so all lists stay the same length)
    new_sheet_data['hdf5_filepath'].append(spectral_savepath)
    new_sheet_data['annotation_filepath'].append(annotation_savepath)
    new_sheet_data['chemical_image_filepath'].append(chemical_savepath)
    new_sheet_data['mask_filepath'].append(mask_savepath)
    # Counted on the un-cropped annotation image, before the crop/pad step above.
    new_sheet_data['annotated_pixels'].append(int(annotated_pixels))
    new_sheet_data['t_or_n'].append(tn)
    new_sheet_data['pathology_info'].append(pathology)
    new_sheet_data['slide'].append(s)
    new_sheet_data['patient_id'].append(p)
    new_sheet_data['core_id'].append(c)
    print("")
print("\n")

Processing cores, on core: 1/230, 
Processing cores, on core: 2/230, 
Processing cores, on core: 3/230,  skipping, no annotated pixels
Processing cores, on core: 4/230, 
Processing cores, on core: 5/230, 
Processing cores, on core: 6/230, 
Processing cores, on core: 7/230, 
Processing cores, on core: 8/230, 
Processing cores, on core: 9/230, 
Processing cores, on core: 10/230, 
Processing cores, on core: 11/230, 
Processing cores, on core: 12/230, 
Processing cores, on core: 13/230, 
Processing cores, on core: 14/230, 
Processing cores, on core: 15/230, 
Processing cores, on core: 16/230, 
Processing cores, on core: 17/230, 
Processing cores, on core: 18/230, 
Processing cores, on core: 19/230, 
Processing cores, on core: 20/230, 
Processing cores, on core: 21/230, 
Processing cores, on core: 22/230, 
Processing cores, on core: 23/230, 
Processing cores, on core: 24/230,  skipping, no annotated pixels
Processing cores, on core: 25/230, 
Processing cores, on core: 26/230, 
Processing co

In [5]:
# Turn the per-core columns collected by the creation loop into a table and
# write it next to the dataset, without the index column.
df = pd.DataFrame(new_sheet_data)
master_sheet_path = fr"{root_dir}/master_sheet.xlsx"
df.to_excel(master_sheet_path, index=False)

In [6]:
# NOTE(review): `core`, `mask` and `wavenumbers` are not assigned in this cell;
# they hold whatever the creation loop left behind from its final iteration, so
# the statistics below describe that single core only -- confirm that
# dataset-wide statistics were not intended.
valid_spectra = core[mask == 1]  # spectra of the pixels whose mask value is 1
np.save(fr"{root_dir}/mean_spectrum.npy", valid_spectra.mean(axis=0))
np.save(fr"{root_dir}/std_spectrum.npy", valid_spectra.std(axis=0))
np.save(fr"{root_dir}/wavenumbers.npy", wavenumbers)