Copy the preprocessed image slices into the folder that corresponds to their subject group (AD, MCI, or CN), producing a directory structure suitable for deep learning.

In [1]:
from pathlib import Path
from get_config import get_config_dict

# Load the project configuration once; every data directory used below is a
# sibling of the raw-data directory named in it.
config = get_config_dict()

parent_data_path = Path(config['raw_data_path']).parent
preprocessed_data_path = parent_data_path / 'preprocessed_data'
preselected_data_path = parent_data_path / 'preselected_data'
labeled_data_path = parent_data_path / 'labeled_data'

## Down-Sampling

In [80]:
# Directory with the preprocessed output of the down-sampled selection.
down_sampled_preprocesed_data_path = preprocessed_data_path / 'preprocessed_data_down_sampled'

# Recursively count the compressed NIfTI volumes found below that directory.
print(f'Number of scans in {down_sampled_preprocesed_data_path.name} directory:')
down_sampled_scan_count = sum(1 for _ in down_sampled_preprocesed_data_path.glob('**/*.nii.gz'))
print(down_sampled_scan_count)

# Recursively count the TIFF slices found below that directory.
print(f'Number of slices in {down_sampled_preprocesed_data_path.name} directory:')
down_sampled_slice_count = sum(1 for _ in down_sampled_preprocesed_data_path.glob('**/*.tiff'))
print(down_sampled_slice_count)

Number of scans in preprocessed_data_down_sampled directory:
405
Number of slices in preprocessed_data_down_sampled directory:
12150


In [3]:
# Destination layout for the down-sampled data set: one sub-folder per group
# label under labeled_data/labeled_data_down_sampled.
down_sampled_labeled_data_path = labeled_data_path / 'labeled_data_down_sampled'

# Map each group label to its destination folder.
down_sampled_group_paths = {
    label: down_sampled_labeled_data_path / label
    for label in ('AD', 'MCI', 'CN')
}

# Keep the individual per-group names available as well.
down_sampled_ad_path = down_sampled_group_paths['AD']
down_sampled_mci_path = down_sampled_group_paths['MCI']
down_sampled_cn_path = down_sampled_group_paths['CN']

# Create the folders (and any missing parents); existing folders are left alone.
for group_dir in down_sampled_group_paths.values():
    group_dir.mkdir(parents=True, exist_ok=True)

down_sampled_group_paths

{'AD': PosixPath('/home/nacer/ADNI_data/ADNI1_Complete_3Yr/labeled_data/labeled_data_down_sampled/AD'),
 'MCI': PosixPath('/home/nacer/ADNI_data/ADNI1_Complete_3Yr/labeled_data/labeled_data_down_sampled/MCI'),
 'CN': PosixPath('/home/nacer/ADNI_data/ADNI1_Complete_3Yr/labeled_data/labeled_data_down_sampled/CN')}

In [4]:
import pandas as pd

# Load the subject -> group table, indexed by the 'Subject' column.
down_sampled_subject_groups_df = pd.read_csv(
    'subject_groups_down_sampled.csv',
    index_col='Subject',
)

# Number of subjects in each group.
down_sampled_subject_groups_df.reset_index().groupby('Group')['Subject'].count()

Group
AD     62
CN     31
MCI    24
Name: Subject, dtype: int64

In [5]:
# Load the final image IDs table for the down-sampled data set and display
# its (rows, columns) dimensions.
down_sampled_final_images = pd.read_csv(
    filepath_or_buffer='final_image_ids_down_sampled.csv',
)
down_sampled_final_images.shape

(405, 3)

In [6]:
# Sanity check: collect every NIfTI file in the preselected (down-sampled)
# directory whose image ID is listed in the final-images table.
down_sampled_preselected_data_path = preselected_data_path / 'preselected_data_down_sampled'
down_sampled_final_image_ids = down_sampled_final_images['Image Data ID'].values

preselected_final_image_paths = []
for nifti_path in down_sampled_preselected_data_path.glob('**/*.nii*'):
    # The image ID is the last '_'-separated token of the file name, taken
    # from the part before the first dot.
    base_name = nifti_path.name.partition('.')[0]
    image_id = base_name.rsplit('_', 1)[-1]
    assert image_id.startswith('I'), f'This is not an image id: {image_id}'

    if image_id not in down_sampled_final_image_ids:
        continue
    preselected_final_image_paths.append(nifti_path)

print(f'Number of down-sampled final image IDs that are in {down_sampled_preselected_data_path.name} directory:')
print(len(preselected_final_image_paths))

Number of down-sampled final image IDs that are in preselected_data_down_sampled directory:
405


In [7]:
# Sanity check: collect every NIfTI file in the preprocessed (down-sampled)
# directory whose image ID is listed in the final-images table.
down_sampled_final_image_ids = down_sampled_final_images['Image Data ID'].values

preprocessed_final_image_paths = []
for nifti_path in down_sampled_preprocesed_data_path.glob('**/*.nii*'):
    # The image ID is the last '_'-separated token of the file name, taken
    # from the part before the first dot.
    base_name = nifti_path.name.partition('.')[0]
    image_id = base_name.rsplit('_', 1)[-1]
    assert image_id.startswith('I'), f'This is not an image id: {image_id}'

    if image_id not in down_sampled_final_image_ids:
        continue
    preprocessed_final_image_paths.append(nifti_path)

print(f'Number of down-sampled final image IDs that are in {down_sampled_preprocesed_data_path.name} directory:')
print(len(preprocessed_final_image_paths))

Number of down-sampled final image IDs that are in preprocessed_data_down_sampled directory:
405


In [8]:
from shutil import copy

if config['down_sample']:
    # Copy every TIFF slice of a selected image into the folder of its
    # subject's group. Source files are left in place (copy, not move).

    # Build the ID lookup once instead of scanning the column for every slice.
    down_sampled_final_image_ids = set(down_sampled_final_images['Image Data ID'])

    # Top-level entries of the preprocessed directory are named by subject ID.
    subject_path_list = list(down_sampled_preprocesed_data_path.glob('*'))

    for subject_path in subject_path_list:
        subject_id = subject_path.name

        # Ignore entries that are not subjects of the down-sampled selection.
        if subject_id not in down_sampled_subject_groups_df.index:
            continue

        # Get subject group
        group = down_sampled_subject_groups_df.loc[subject_id, 'Group']

        for slice_path in subject_path.glob('**/*.tiff'):
            # The slice's parent directory name is used as the image ID.
            image_id = slice_path.parent.name

            # Only copy slices of images listed in the final-images table.
            if image_id not in down_sampled_final_image_ids:
                continue

            # The destination must be a file path inside the group folder.
            # Testing the group folder itself with exists() is always true
            # (the folders are created beforehand), so nothing would ever
            # be copied.
            # NOTE(review): assumes slice file names are unique across images,
            # as the up-sampling cell's '<stem>_copy<i>.tiff' naming also does.
            dst_slice_path = down_sampled_group_paths[group].joinpath(slice_path.name)
            if not dst_slice_path.exists():
                copy(slice_path, dst_slice_path)
    

In [9]:
# Compare the number of labeled slices on disk with the expected total.
print(f'Total number of images in {down_sampled_labeled_data_path.name} directory:')

# 30 is presumably the number of slices per scan -- TODO confirm and move it
# to a named constant.
target_labeled_image_count = len(down_sampled_final_images) * 30
down_sampled_labeled_slice_count = sum(1 for _ in down_sampled_labeled_data_path.glob('**/*.tiff'))
print(down_sampled_labeled_slice_count, '/', target_labeled_image_count)

Total number of images in labeled_data_down_sampled directory:
12150 / 12150


In [10]:
# Expected slices per group: the overall target split evenly across the three
# group folders (30 is presumably the number of slices per scan -- TODO confirm).
target_labeled_image_count = int(len(down_sampled_final_images) * 30 / 3)

for group, path in down_sampled_group_paths.items():
    print(f'Total number of images in {group} directory:')
    group_slice_count = sum(1 for _ in path.glob('**/*.tiff'))
    print(group_slice_count, '/', target_labeled_image_count, end='\n\n')

Total number of images in AD directory:
4050 / 4050

Total number of images in MCI directory:
4050 / 4050

Total number of images in CN directory:


4050 / 4050



## Up-Sampling

In [11]:
# Directory with the preprocessed output of the up-sampled selection.
up_sampled_preprocesed_data_path = preprocessed_data_path / 'preprocessed_data_up_sampled'

# Recursively count the compressed NIfTI volumes found below that directory.
print(f'Number of scans in {up_sampled_preprocesed_data_path.name} directory:')
up_sampled_scan_count = sum(1 for _ in up_sampled_preprocesed_data_path.glob('**/*.nii.gz'))
print(up_sampled_scan_count)

# Recursively count the TIFF slices found below that directory.
print(f'Number of slices in {up_sampled_preprocesed_data_path.name} directory:')
up_sampled_slice_count = sum(1 for _ in up_sampled_preprocesed_data_path.glob('**/*.tiff'))
print(up_sampled_slice_count)

Number of scans in preprocessed_data_up_sampled directory:


582
Number of slices in preprocessed_data_up_sampled directory:


17460


In [12]:
# Create the destination folder structure (up_sampled)
up_sampled_labeled_data_path = labeled_data_path.joinpath('labeled_data_up_sampled')
up_sampled_ad_path = up_sampled_labeled_data_path.joinpath('AD')
up_sampled_mci_path = up_sampled_labeled_data_path.joinpath('MCI')
up_sampled_cn_path = up_sampled_labeled_data_path.joinpath('CN')
for path in [up_sampled_ad_path, up_sampled_cn_path, up_sampled_mci_path]:
    path.mkdir(parents=True, exist_ok=True)

up_sampled_group_paths = {
    'AD': up_sampled_ad_path,
    'MCI': up_sampled_mci_path,
    'CN': up_sampled_cn_path
}
up_sampled_group_paths

{'AD': PosixPath('/home/nacer/ADNI_data/ADNI1_Complete_3Yr/labeled_data/labeled_data_up_sampled/AD'),
 'MCI': PosixPath('/home/nacer/ADNI_data/ADNI1_Complete_3Yr/labeled_data/labeled_data_up_sampled/MCI'),
 'CN': PosixPath('/home/nacer/ADNI_data/ADNI1_Complete_3Yr/labeled_data/labeled_data_up_sampled/CN')}

In [13]:
# Read the subject groups csv
up_sampled_subject_groups_df = pd.read_csv('subject_groups_up_sampled.csv', index_col='Subject')
up_sampled_subject_groups_df.reset_index().groupby('Group').Subject.count()

Group
AD      62
CN      81
MCI    100
Name: Subject, dtype: int64

In [14]:
# Up-sample the final images
up_sampled_final_images = pd.read_csv('final_image_ids_up_sampled.csv')
up_sampled_final_images.shape

(744, 3)

In [15]:
# Sanity check: collect every '.nii' file in the preselected (up-sampled)
# directory whose image ID is listed in the final-images table.
up_sampled_preselected_data_path = preselected_data_path / 'preselected_data_up_sampled'
up_sampled_final_image_ids = up_sampled_final_images['Image Data ID'].values

preselected_final_image_paths = []
for nifti_path in up_sampled_preselected_data_path.glob('**/*.nii'):
    # The image ID is the last '_'-separated token of the file name, taken
    # from the part before the first dot.
    base_name = nifti_path.name.partition('.')[0]
    image_id = base_name.rsplit('_', 1)[-1]
    assert image_id.startswith('I'), f'This is not an image id: {image_id}'

    if image_id not in up_sampled_final_image_ids:
        continue
    preselected_final_image_paths.append(nifti_path)

print(f'Number of up-sampled final image IDs that are in {up_sampled_preselected_data_path.name} directory:')
print(len(preselected_final_image_paths))

Number of up-sampled final image IDs that are in preselected_data_up_sampled directory:
582


In [16]:
# Sanity check: collect every NIfTI file in the preprocessed (up-sampled)
# directory whose image ID is listed in the final-images table.
up_sampled_final_image_ids = up_sampled_final_images['Image Data ID'].values

preprocessed_final_image_paths = []
for nifti_path in up_sampled_preprocesed_data_path.glob('**/*.nii*'):
    # The image ID is the last '_'-separated token of the file name, taken
    # from the part before the first dot.
    base_name = nifti_path.name.partition('.')[0]
    image_id = base_name.rsplit('_', 1)[-1]
    assert image_id.startswith('I'), f'This is not an image id: {image_id}'

    if image_id not in up_sampled_final_image_ids:
        continue
    preprocessed_final_image_paths.append(nifti_path)

print(f'Number of up-sampled final image IDs that are in {up_sampled_preprocesed_data_path.name} directory:')
print(len(preprocessed_final_image_paths))

Number of up-sampled final image IDs that are in preprocessed_data_up_sampled directory:
582


In [17]:
from shutil import copy

if config['up_sample']:
    # Copy every TIFF slice of a selected image into the folder of its
    # subject's group, once per occurrence of the image ID in the final-images
    # table. Source files are left in place (copy, not move).

    # Occurrences of each image ID; an ID listed k times yields k copies of
    # each slice, named '<stem>_copy0.tiff' ... '<stem>_copy{k-1}.tiff'.
    image_ids_value_counts = up_sampled_final_images['Image Data ID'].value_counts()

    # Top-level entries of the preprocessed directory are named by subject ID.
    subject_path_list = list(up_sampled_preprocesed_data_path.glob('*'))
    for subject_path in subject_path_list:
        subject_id = subject_path.name

        # Ignore entries that are not subjects of the up-sampled selection
        # (same guard as the down-sampling cell); without it a stray file or
        # folder raises KeyError in the lookup below.
        if subject_id not in up_sampled_subject_groups_df.index:
            continue

        # Get subject group
        group = up_sampled_subject_groups_df.loc[subject_id, 'Group']

        # Entries below the subject whose name starts with 'I'; the entry name
        # is treated as the image ID.
        for image_path in subject_path.glob('**/I*'):
            image_id = image_path.name

            # Only copy images listed in the final-images table. The
            # value_counts index holds exactly the distinct listed IDs.
            if image_id not in image_ids_value_counts.index:
                continue

            # Get the image repetitions
            image_copies = image_ids_value_counts[image_id]

            # The set of slices does not depend on the copy number, so list
            # them once rather than re-globbing for every copy.
            slice_path_list = list(image_path.glob('**/*.tiff'))

            for i in range(image_copies):
                for slice_path in slice_path_list:
                    # Copy the slice unless this numbered copy already exists,
                    # so the cell can be re-run safely.
                    dst_slice_path = up_sampled_group_paths[group].joinpath(f'{slice_path.stem}_copy{i}.tiff')
                    if not dst_slice_path.exists():
                        copy(slice_path, dst_slice_path)
            

In [18]:
# Compare the number of labeled slices on disk with the expected total.
print(f'Total number of images in {up_sampled_labeled_data_path.name} directory:')

# 30 is presumably the number of slices per scan -- TODO confirm and move it
# to a named constant.
target_labeled_image_count = len(up_sampled_final_images) * 30
up_sampled_labeled_slice_count = sum(1 for _ in up_sampled_labeled_data_path.glob('**/*.tiff'))
print(up_sampled_labeled_slice_count, '/', target_labeled_image_count)

Total number of images in labeled_data_up_sampled directory:
22320 / 22320


In [81]:
# Expected slices per group: the overall target split evenly across the three
# group folders (30 is presumably the number of slices per scan -- TODO confirm).
target_labeled_image_count = int(len(up_sampled_final_images) * 30 / 3)

for group, path in up_sampled_group_paths.items():
    print(f'Total number of images in {group} directory:')
    group_slice_count = sum(1 for _ in path.glob('**/*.tiff'))
    print(group_slice_count, '/', target_labeled_image_count, end='\n\n')

Total number of images in AD directory:
7440 / 7440

Total number of images in MCI directory:
7440 / 7440

Total number of images in CN directory:
7440 / 7440

