In [1]:
import os
import pandas as pd
import shutil
import SimpleITK as sitk

from constants import Constants
constants = Constants()

from pathlib import Path

In [2]:
def get_patientID(filename):
    """Return the patient identifier found at the start of ``filename``.

    The identifier is simply the first 14 characters of the name (the
    length of an ID such as ``ProstateX-0000``). Names shorter than that
    are returned unchanged.
    """
    id_length = 14
    return filename[:id_length]


def get_excluded_filenames(filename):
    """Tell whether ``filename`` is free of every unwanted-series marker.

    Note the polarity: despite the function name, the result is ``True``
    when NONE of the markers occurs in ``filename`` (the file may be kept)
    and ``False`` as soon as one of them is present. Matching is a plain,
    case-sensitive substring test.
    """
    unwanted_markers = (
        'traa', 'trab', 'loc', 'ADCa', 'Noise0', 'S3_2', 'Eq_1', 'copy',
    )
    return not any(marker in filename for marker in unwanted_markers)


def get_image_type(filename):
    """Classify ``filename`` as ``'T2'``, ``'ADC'``, ``'KTrans'`` or ``'unknown'``.

    The checks are case-sensitive substring tests, applied in this order:

    * ``'T2'``     - name contains both ``'t2'`` and ``'tra'`` and passes
      the marker filter of ``get_excluded_filenames``;
    * ``'ADC'``    - name contains ``'ADC'`` and passes the same filter;
    * ``'KTrans'`` - name contains ``'Ktrans'`` (the marker filter is not
      applied to this case);
    * ``'unknown'`` otherwise.
    """
    # True when the name carries none of the unwanted-series markers.
    keep = get_excluded_filenames(filename)

    looks_like_t2 = 't2' in filename and 'tra' in filename
    looks_like_adc = 'ADC' in filename

    if keep and looks_like_t2:
        return 'T2'
    if keep and looks_like_adc:
        return 'ADC'
    return 'KTrans' if 'Ktrans' in filename else 'unknown'

# Preprocess data

In [3]:
# Root of the dataset; wrapped in Path so the `/` joins below also work
# when the configured value is a plain string.
ROOT = Path(constants.dataset_path)

# Path to unprocessed data
UNPROCESSED = ROOT / 'unprocessed'


# Path to data converted using MRIcoGL
UNPROCESSED_NIFTI = UNPROCESSED / 'NIFTI'
# Path to KTrans data (*.mhd)
UNPROCESSED_KTRANS = UNPROCESSED / 'KTrans'


# Path to information
FINDINGS = ROOT / 'ProstateX-Findings-Train.csv'
LESION_INFO = Path('ProstateX_plotting/info_df.pickle')

# Path to processed data
PROCESSED = ROOT / 'processed'
# parents=True creates missing parent directories; exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of
# `if not exists(): os.mkdir(...)`.
PROCESSED.mkdir(parents=True, exist_ok=True)

## Sort the data files

In [4]:
# One record (dict) per patient, stored at the list position given by the
# numeric part of the patient ID so that later cells can use
# `findings_data[idx]` directly.
findings_data = []
patients = []

# os.listdir() returns names in arbitrary order; sort them so the result
# does not depend on the file system's listing order.
for file in sorted(os.listdir(UNPROCESSED_NIFTI)):
    image_type = get_image_type(file)
    if image_type != 'unknown' and 'nii.gz' in file:
        patient_id = get_patientID(file)

        # Everything after the first 10 characters of the ID is parsed as
        # the patient number and used as the list position.
        idx = int(patient_id[10:])

        # Make sure slot `idx` exists before it is accessed. Appending only
        # one record per new patient raised IndexError whenever patient
        # numbers were not encountered as 0, 1, 2, ... without gaps.
        # Numbers with no matching file are left as empty placeholder dicts.
        while len(findings_data) <= idx:
            findings_data.append({})

        if patient_id not in patients:
            patients.append(patient_id)
            findings_data[idx]['ProxID'] = patient_id
            findings_data[idx]['ClinSig'] = []
            findings_data[idx]['fid'] = []
            findings_data[idx]['pos'] = []
            findings_data[idx]['zone'] = []

        src = UNPROCESSED_NIFTI / file
        dest = PROCESSED / file
        # A later file of the same image type replaces the earlier entry.
        findings_data[idx][image_type] = dest

        shutil.copyfile(src, dest)

print(f'All NIFTI T2-weighted and ADC files were stored at path: {PROCESSED}')

All NIFTI T2-weighted and ADC files were stored at path: data\processed


## Convert KTrans

In [5]:
def convert_mhd_to_nifti(file, input_path, output_path):
    """Convert one MetaImage header file (*.mhd) to a compressed NIfTI file.

    Parameters
    ----------
    file : str
        Name of the ``.mhd`` file inside ``input_path``.
    input_path : str or path-like
        Directory that contains ``file``.
    output_path : str or path-like
        Directory in which the ``.nii.gz`` file is created.

    Returns
    -------
    pathlib.Path
        Location of the written NIfTI file: ``output_path`` joined with
        ``file`` after replacing ``'.mhd'`` by ``'.nii.gz'``.
    """
    mhd_image = os.path.join(input_path, file)
    img = sitk.ReadImage(str(mhd_image))

    nii_filename = file.replace('.mhd', '.nii.gz')
    img_nii_path = Path(output_path) / nii_filename

    # Write straight to the destination instead of writing into the current
    # working directory and moving the result: no stray file is left behind
    # on failure, nothing in the CWD can be overwritten, and the CWD does
    # not need to be writable. The path is passed as str because not every
    # SimpleITK release accepts pathlib objects.
    sitk.WriteImage(img, str(img_nii_path))
    return img_nii_path


# Walk the KTrans tree and convert every KTrans *.mhd file to NIfTI,
# recording the converted file's path in the matching patient record.
for root, _, files in os.walk(UNPROCESSED_KTRANS):
    for file in files:
        image_type = get_image_type(file)
        # Skip anything that is not a KTrans file with 'mhd' in its name.
        if image_type != 'KTrans' or 'mhd' not in file:
            continue

        patient_id = get_patientID(file)
        # Patient number (text after the first 10 characters of the ID)
        # selects the record in findings_data.
        idx = int(patient_id[10:])
        findings_data[idx][image_type] = convert_mhd_to_nifti(file, root, PROCESSED)

print(f'All converted KTrans files were stored at path: {PROCESSED}')

All converted KTrans files were stored at path: data\processed


## Combine information

In [6]:
# NOTE(review): read_pickle executes whatever the pickle contains; only
# load LESION_INFO from a trusted source.
findings_df = pd.read_csv(FINDINGS)
lesion_info_df = pd.read_pickle(LESION_INFO)

# One CSV row per finding: append its values to the owning patient's record.
for i in range(len(findings_df)):
    findings_row = findings_df.iloc[i]
    patient = findings_row.ProxID

    # Patient number = text after the first 10 characters of the ID.
    idx = int(patient[10:])
    # lesion_info_df is addressed by POSITION with the patient number
    # (assumes one row per patient, in patient order - TODO confirm).
    lesion_info_row = lesion_info_df.iloc[idx]

    record = findings_data[idx]
    for field in ('ClinSig', 'fid', 'pos', 'zone'):
        record[field].append(getattr(findings_row, field))

    # Voxel spacing and slices containing the prostate lesion
    record['spacing'] = lesion_info_row.spacing
    record['slices'] = lesion_info_row.fg_slices

# Fix the column order of the combined table.
column_order = [
    'ProxID', 'ClinSig', 'fid', 'pos', 'zone', 'spacing', 'slices', 'T2', 'ADC', 'KTrans'
]
findings = pd.DataFrame(findings_data, columns=column_order)

## Create pickle

In [7]:
# Store the combined findings table alongside the processed images.
findings_pickle = PROCESSED / 'lesion_findings.pickle'
findings.to_pickle(findings_pickle)

print(f'The findings information can be found at: {findings_pickle}')

The findings information can be found at: data\processed\lesion_findings.pickle
