# Imaging Data on Wynton IC App Server

In [None]:
import pandas as pd
import numpy as np
import os 
import re

# `plt` must refer to the pyplot interface (plt.figure, plt.subplot, plt.imshow);
# the top-level `matplotlib` package does not provide those plotting functions.
import matplotlib.pyplot as plt
#Specify where plots are displayed to view images in the notebook
%matplotlib inline 

import pyrootutils
root = pyrootutils.setup_root(
    search_from=os.path.abspath(''),
    indicator=[".git"],
    pythonpath=True, # add root directory to the PYTHONPATH (helps with imports)
    dotenv=True, # load environment variables from .env if exists in root directory
)

from src.utils.query_utils.extractor import Extractor
from src.utils.query_utils.elasticsearch_helpers import ImagePathQuery 
from src.utils.file_management.dicom_utils import get_volume_from_directory
from src.utils.visualizers.plot_images import plot_volume 

#from pprint import pprint as pp

In [None]:
# --- Cohort selection ---
# This cell loads the cohort YAML configuration, creates the file manager and
# extractor helpers, and reads the working list of accession numbers.
# NOTE: `cohort_cfg_path` below is an unfilled template placeholder; assign a
# path string after the `=` before running, otherwise the cell does not parse.
# Load file configuration
from src.utils.file_management.config_loader import load_yaml, process_config_values
from src.utils.file_management.file_manager import FileManager

cohort_cfg_path = #TODO '/path_to_your_project/code/config/datasets/cohort03_MriNoninvasive.yaml'
config = load_yaml(cohort_cfg_path)
config = process_config_values(config)
# Show which top-level configuration entries are available
print(config.keys())

# File manager built from the 'file_directory' entry of the configuration
PlumsFiles = FileManager(config.get('file_directory'))

# Initialize data extraction tools
PlumsExtractor = Extractor()

# Identify Imaging ID (accessions)
# The file registered as 'accessionnumber_csv' must contain an 'accessionnumber' column.
df = PlumsFiles.read_file(PlumsFiles.get_datapath('accessionnumber_csv'))
accessionnumber_workinglist = df['accessionnumber'].to_list()
# presumably drops missing/invalid accession values — confirm against Extractor.remove_invalid
accessionnumber_workinglist = PlumsExtractor.remove_invalid(accessionnumber_workinglist)
print(len(accessionnumber_workinglist))

## Select best matching series (e.g. T1 sag)

In [None]:
# --- Functions ---
# Series whose (lower-cased) description matches any of these alternatives are never selected:
# contrast (gad, +c, c+, pre, post), pelvis, FLAIR / IDEAL (different contrast), 3D,
# PROPELLER and cervical-spine variants (csp, c-sp, c spine).
# The pattern is applied to lower-cased text, so every alternative must itself be lower case.
# NOTE(review): short alternatives such as 'si', 'pre' and 'prop' match anywhere in the
# description and may exclude more series than intended — confirm against real descriptions.
_EXCLUDED_SERIES_RE = re.compile(
    r'gad|\+c|c\+|pre|post|si|joints|water|inphase|subtle|prop|pelv|flair|ideal|3d|propeller|csp|c-sp|c\ spine'
)
# Repeated acquisitions are preferred over the first acquisition of the same visit.
_REPEAT_SERIES_RE = re.compile(r'repeat|rpt')


def select_files_by_priority(dcm_dirpath_list, series_desc_list):
    """Pick one series directory per parent (date) directory.

    Series paths have the folder structure ``.../patient/date/series``. Paths are
    grouped by their parent directory (the path with the series folder removed) and,
    within each group, one series is chosen:

    * series whose description matches the exclusion pattern are skipped;
    * the last series whose description contains ``repeat`` or ``rpt`` wins;
    * otherwise the first non-excluded series is used.

    Args:
        dcm_dirpath_list: Series directory paths.
        series_desc_list: Series descriptions, aligned index-by-index with
            ``dcm_dirpath_list``.

    Returns:
        Tuple ``(best_path_list, best_desc_list, desc_priority_list)``.
        The first two lists have one entry per distinct parent directory, in
        first-seen order, and hold ``None`` where every series was excluded.
        ``desc_priority_list`` holds, for each parent directory with more than one
        recorded candidate, the repeat descriptions (latest first) followed by the
        initially chosen non-repeat description, if one was seen before any repeat.
    """
    # Group series by their exact parent directory. Comparing whole directory names
    # (instead of a substring test) keeps '/x/d1' from capturing series of '/x/d10',
    # and the insertion-ordered dict makes the output order reproducible between runs.
    series_by_date = {}
    for dcm_dirpath, dcm_desc in zip(dcm_dirpath_list, series_desc_list):
        date_dirpath = os.path.split(dcm_dirpath)[0]
        series_by_date.setdefault(date_dirpath, []).append((dcm_dirpath, dcm_desc))

    date_dirpath_list = list(series_by_date)
    best_path_list = [None] * len(date_dirpath_list)
    best_desc_list = [None] * len(date_dirpath_list)
    desc_priority_list = []

    # Directories are visited last-to-first, which determines the order of desc_priority_list.
    for date_row in reversed(range(len(date_dirpath_list))):
        desc_options = []
        for dcm_dirpath, dcm_desc in series_by_date[date_dirpath_list[date_row]]:
            desc_lower = dcm_desc.lower()
            if _EXCLUDED_SERIES_RE.search(desc_lower):
                continue

            if _REPEAT_SERIES_RE.search(desc_lower):
                # A repeat always replaces the current choice, so the last repeat wins.
                best_path_list[date_row] = dcm_dirpath
                best_desc_list[date_row] = dcm_desc
                desc_options.insert(0, dcm_desc)
            elif best_path_list[date_row] is None:
                # First acceptable series for this directory.
                best_path_list[date_row] = dcm_dirpath
                best_desc_list[date_row] = dcm_desc
                desc_options.append(dcm_desc)

        if len(desc_options) > 1:
            desc_priority_list.append(desc_options)

    return best_path_list, best_desc_list, desc_priority_list

# Take the later scan if same name
# HRDWRE - consider keeping - check a few cases to get a sense of seg quality
# LOWER

def get_metadata_summary_for_filepaths(query_output, metadata_fields: list = None):
    """Collect selected metadata fields from every query into per-field lists.

    Args:
        query_output: Iterable of ``(key, query)`` pairs, e.g. ``dict.items()``.
            Each ``query`` must provide ``get_all_metadata_field(name)``, whose
            result is appended element-wise to the field's list. ``key`` is unused.
        metadata_fields: Short field names (keys of ``metadata_keys`` below) to
            extract. ``None`` or an empty list selects every known field.

    Returns:
        dict mapping each requested short field name to the list of values
        gathered across all queries, in iteration order.
    """
    # Short (colloquial) field name -> metadata field name requested from the query
    # NOTE(review): 'Patient ID' contains a space unlike the other names — confirm
    # it matches the field name used by the index.
    metadata_keys = {
        'dirpath':'FilePath',
        'AccessionNumber':'AccessionNumber',
        'PID':'Patient ID',
        'ETL':'EchoTrainLength',
        'field_strength': 'MagneticFieldStrength',
        'series_desc': 'SeriesDescription',
        'study_desc': 'StudyDescription',
        'TE': 'EchoTime',
        'TR': 'RepetitionTime',
        'flip_angle': 'FlipAngle',
        'recieve_coil': 'ReceiveCoilName',
        'scanner_name': 'StationName',
        'scanner_model': 'ManufacturerModelName',
        'slice_thickness': 'SliceThickness',
        'slice_spacing': 'SpacingBetweenSlices',
        'pixel_spacing': 'PixelSpacing',
        'rows':'Rows',
        'columns':'Columns'
    }

    # No explicit selection means "all known fields"
    if metadata_fields is None or metadata_fields == []:
        metadata_fields = list(metadata_keys)

    # One output list per requested field
    metadata_summary = {field: [] for field in metadata_fields}

    for _key, current_query in query_output:
        for field in metadata_fields:
            try:
                values = current_query.get_all_metadata_field(metadata_keys[field])
                metadata_summary[field].extend(values)
            except Exception:
                # Best effort: an unknown field name or a failed lookup is recorded
                # as a single NaN instead of aborting the whole extraction.
                # NOTE(review): a single NaN may leave this list shorter than the
                # lists of fields that succeeded for the same query — confirm this
                # is acceptable for callers that build a DataFrame from the result.
                metadata_summary[field].append(float("nan"))

    return metadata_summary

def and_all_match(search_str:str, search_terms:list):
    """Return True only when every term in search_terms occurs in search_str."""
    for term in search_terms:
        if term not in search_str:
            return False
    return True

In [None]:
# --- Main ---
# Initialize and query all images whose "SeriesDescription" contains the search terms
t1sag_query = ImagePathQuery(accessionnumber_workinglist, 't1', 'sag')
t1ax_query = ImagePathQuery(accessionnumber_workinglist, 't1', 'ax')
t2sag_query = ImagePathQuery(accessionnumber_workinglist, 't2', 'sag')
t2ax_query = ImagePathQuery(accessionnumber_workinglist, 't2', 'ax')

# --- DEBUG START ---
# test if the query worked by accessing the metadata of the first match
#t1sag_query.get_metadata(idx=0)
# --- DEBUG END ---

# Now we have all images with a relevant "SeriesDescription". However, there may be
# multiple scans for each accession number (ex. T1 SAG vs T1 SAG FS). Note an accession
# number is associated with a patient visit. For the cases with multiple scan series,
# we need to (1) select the best scan series and (2) update the query variable.
queries_by_scan = {
    't1sag': t1sag_query,
    't1ax': t1ax_query,
    't2sag': t2sag_query,
    't2ax': t2ax_query,
}

for key, current_query in queries_by_scan.items():
    print(f"Number of {key} series (with repeats): {current_query.get_total_num_results()}")

    # Identify, for each accession number, the filepath of the scan with the most
    # relevant "SeriesDescription"
    series_desc_list = current_query.get_all_metadata_field('SeriesDescription')
    dcm_dirpath_list = current_query.get_all_metadata_field('FilePath')
    best_path_list, best_desc_list, desc_priority_list = select_files_by_priority(
        dcm_dirpath_list, series_desc_list
    )
    best_path_list = PlumsExtractor.remove_invalid(best_path_list)
    print(desc_priority_list)

    ## --- FOR DEBUG ---
    #sample_MRI_list_filepath = MRI_dir+'/'+key+'.csv'
    #df = pd.DataFrame()
    #df['series_desc'] = series_desc_list
    #df['dcm_directory'] = dcm_dirpath_list
    #df.to_csv(sample_MRI_list_filepath)

    #best_path_list = PlumsExtractor.remove_invalid(dcm_dirpath_list)
    ##--- END DEBUG ---

    # Restrict the query to the selected files. `current_query` is the very same
    # object as the matching t1sag/t1ax/t2sag/t2ax variable, so updating it here
    # updates that variable as well.
    print(f"Number of {key} series (with only best): {len(best_path_list)}")
    current_query.update_query_with_filepaths(best_path_list)
    print(f"Number of {key} series (with only best): {current_query.get_total_num_results()}")
    print()

In [None]:
# Build one metadata table covering the four (already filtered) queries
query_output = {
    't1sag': t1sag_query,
    't1ax': t1ax_query,
    't2sag': t2sag_query,
    't2ax': t2ax_query,
}.items()
metadata_summary_dict = get_metadata_summary_for_filepaths(query_output, metadata_fields=[])

df_metadata = pd.DataFrame(metadata_summary_dict)
df_metadata = df_metadata.sort_values('AccessionNumber').reset_index(drop=True)
num_series = len(df_metadata)
# Counts distinct accession numbers (the message labels them as patients)
num_accessions = len(set(df_metadata['AccessionNumber']))
print(f"all\t number of series: {num_series}, number of patients: {num_accessions}")

# Add a column with the (standardized) type of scan to the metadata dataframe.
# Rules are checked in order; the first rule whose terms all occur in the
# lower-cased description wins, otherwise the literal string 'NaN' is stored.
scan_type_rules = [
    ('t1-sag', ['t1', 'sag']),
    ('t1-ax', ['t1', 'ax']),
    ('t2-sag', ['t2', 'sag']),
    ('t2-ax', ['t2', 'ax']),
]
scan_type = []
for series_desc in df_metadata['series_desc']:
    series_desc = str(series_desc)
    desc_lower = series_desc.lower()
    matched_label = next(
        (label for label, terms in scan_type_rules if and_all_match(desc_lower, terms)),
        'NaN',
    )
    scan_type.append(matched_label)
df_metadata['scan_type'] = scan_type

# Save the series metadata table into the configured query output directory
query_dir = config.get('query_output_dir')
save_path = query_dir + '/20240801_dcm_dirlist_t1sag_t1ax_t2sag_t2ax_all_seqs.csv'
print(save_path)
PlumsFiles.save_df_to_csv(df_metadata, save_path)
df_metadata.tail(6)

## Request Images

In [None]:
# --- Functions ---
def get_accession_list(cohort_filepath):
    """Read the accession numbers listed in a cohort table.

    The table is read as CSV first and, if that fails, as an Excel workbook.
    The accession column may be named ``accessionnumber`` or ``AccessionNumber``
    (checked in that order).

    Args:
        cohort_filepath: Path (or file-like object) accepted by
            ``pandas.read_csv`` / ``pandas.read_excel``.

    Returns:
        list of accession values with missing (NaN) entries removed.

    Raises:
        KeyError: If neither accession column is present.
    """
    try:
        cohort_df = pd.read_csv(cohort_filepath)
    except Exception:
        # Not readable as CSV; fall back to Excel. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt/SystemExit propagate.
        cohort_df = pd.read_excel(cohort_filepath)

    for column in ('accessionnumber', 'AccessionNumber'):
        if column in cohort_df.columns:
            accessionlist = cohort_df[column].tolist()
            break
    else:
        raise KeyError(
            "cohort table has neither an 'accessionnumber' nor an 'AccessionNumber' column"
        )

    # Missing cells are read as NaN, whose string form is 'nan'
    cohortlist = [x for x in accessionlist if str(x) != 'nan']
    return cohortlist

def remove_invalid_entries(input_list):
    """Return a new list holding every element of input_list that is not None."""
    kept = []
    for entry in input_list:
        if entry is None:
            continue
        kept.append(entry)
    return kept

In [None]:
# --- Main ---
# Build the list of accession numbers that are in the cohort but not yet among
# the existing images, and save it as a CSV image request.
# NOTE: the three `# TODO` assignments below are unfilled template placeholders;
# assign path strings after the `=` before running, otherwise the cell does not parse.
# New image request
# New cohort
cohort_filepath = # TODO 'path_to_your_project/AccessionNumberList.csv'
cohortlist = get_accession_list(cohort_filepath)

# Existing images
existing_filepath = # TODO 'path_to_existing_MRI_list/20240612_dcm_dirlist_t1sag_t1ax_t2sag_t2ax_all_seqs.csv'
existinglist = get_accession_list(existing_filepath)

# Missing images
# Set difference: duplicates are dropped and the resulting row order is arbitrary.
new_request_list = set(cohortlist) - set(existinglist)
new_request_df = pd.DataFrame()
new_request_df['accessionnumber'] = [x for x in new_request_list]

# Save accessionnumbers to request images
request_path = # TODO 'path_to_your_project/img_requests/20240613_img_request.csv'
# to_csv is called with its defaults, so the dataframe index is written as the first column
new_request_df.to_csv(request_path)
# Last expression of the cell: displays the request table in the notebook
new_request_df

## Copy images to a staging directory for transfer to the rad server

In [None]:
# --- Functions ---
import shutil, errno

def copyanything(src, dst):
    """Copy src to dst, whether src is a directory tree or a single file.

    A directory copy is attempted first. If that fails because src is not a
    directory (ENOTDIR / EINVAL), src is copied as a plain file instead. Any
    other failure is printed rather than raised, so a batch of copies keeps going.
    """
    try:
        shutil.copytree(src, dst)
    except OSError as tree_error:
        if tree_error.errno not in (errno.ENOTDIR, errno.EINVAL):
            # Unrelated OS error (e.g. destination exists): report and give up
            print(tree_error)
            return
        try:
            shutil.copy(src, dst)
        except Exception as file_error:
            print(file_error)
    except Exception as unexpected_error:
        print(unexpected_error)

In [None]:
# --- Main ---
# Copy every series directory listed in df_metadata into the sample-images
# directory, mirroring the original layout below `orig_prefix`.
# NOTE: `orig_prefix` below is an unfilled template placeholder; assign the
# storage root path after the `=` before running, otherwise the cell does not parse.
MRI_dir = config.get('sample_images_dir')
os.makedirs(MRI_dir, exist_ok=True)
print(MRI_dir)

orig_prefix = # TODO 'path_to_data_storage/data/storage'

# --- FOR DEBUG ---
# sample_orig_path = # TODO
# sample_new_path = sample_orig_path.replace(orig_prefix,MRI_dir)
# print(sample_new_path)

# copyanything(sample_orig_path,sample_new_path)
# --- END DEBUG ---

for orig_path in df_metadata['dirpath']:
    # str.replace substitutes every occurrence of orig_prefix; if the prefix does
    # not occur in orig_path, new_path equals orig_path.
    # assumes every 'dirpath' value is a string (no NaN entries) — TODO confirm
    new_path = orig_path.replace(orig_prefix,MRI_dir)
    # copyanything prints failures instead of raising, so the loop continues past errors
    copyanything(orig_path,new_path)

# View Images

In [None]:
# Preview the series stored in the first four rows of the metadata table
idx_list = list(range(4))
for idx in idx_list:
    path = df_metadata['dirpath'].loc[idx]
    vol, metadata = get_volume_from_directory(path)
    print(np.shape(vol))
    # Show the volume on a grid with 6 columns
    fig = plot_volume(vol, cols=6, scale=4)

In [None]:
# # Load DICOM files and sort by image position patient
# ds_list = []
# for dicom_path in dicom_files:
#     ds = pydicom.read_file(str(dicom_path))
#     ds_list.append(ds)
# ds_list.sort(key=lambda ds: ds.ImagePositionPatient[2])

# #Display the pixel data on a grid:
# num_images = len(dicom_files)
# cols = 10
# rows = int(num_images / cols) + 1
# scale = 3
# plt.figure(figsize=(cols * scale, rows * scale))

# print(f'Patient ID: {patient}, Exam: {exam}, Series: {description}')
# for idx, ds in enumerate(ds_list):
#     plt.subplot(rows, cols, idx + 1)
#     plt.imshow(ds.pixel_array, cmap = 'gray')
#     plt.axis('off')