In [1]:
import shutil
from glob import glob
from pathlib import Path
import numpy as np
import cv2
import hashlib
import os


In [2]:
def remove_duplicate_images() -> None:
    """
    Remove duplicate images from the 'no' and 'yes' folders.

    This function searches for duplicate images in the 'no' and 'yes'
    folders within the brain_tumor_dataset/raw_images directory
    (resolved relative to the parent of the current working directory).
    Two files are duplicates when their decoded pixel arrays have the same
    shape and the same md5 digest. The first file seen with a given digest
    is kept; every later match is deleted from disk.

    One digest table is shared by both folders and 'no' is scanned before
    'yes', so a 'yes' image identical to a 'no' image is removed from 'yes'.

    Files that OpenCV cannot decode are reported and left untouched.
    """
    image_folder = (
        Path().cwd().parent
        / Path("data")
        / Path("brain_tumor_dataset")
        / Path("raw_images")
    )

    class_folders = ["no", "yes"]

    first_seen = {}  # digest -> Path of the first file with that content
    dupes = []  # Paths of later files matching an already-seen digest

    for f in class_folders:
        # Sort so that the surviving copy of each duplicate set is the same
        # on every run, regardless of filesystem listing order.
        for image in sorted((image_folder / Path(f)).glob("*")):
            if not image.is_file():
                continue

            img_array = cv2.imread(str(image))
            if img_array is None:
                # imread signals "could not decode" with None; hashing that
                # would make every unreadable file look like a duplicate.
                print(f"Skipping unreadable file: {image}")
                continue

            # Digest the raw pixel bytes, prefixed with the shape so that
            # differently-shaped images with equal byte streams stay distinct.
            hasher = hashlib.md5()
            hasher.update(repr(img_array.shape).encode("utf-8"))
            hasher.update(img_array.tobytes())
            file_hash = hasher.hexdigest()

            if file_hash in first_seen:
                dupes.append(image)
            else:
                first_seen[file_hash] = image

    print(f"Total number of duplicated images: {len(dupes)}")

    for duplicate in dupes:
        duplicate.unlink()


In [3]:
def create_folder_structure() -> None:
    """
    Create the train/test/val folder structure for project images.

    This function creates the TRAIN_IMGS, TEST_IMGS and VALIDATION_IMGS
    folders within the brain_tumor_dataset/formatted_images directory
    (resolved relative to the parent of the current working directory).
    Within each of these folders, it creates 'YES' and 'NO' subfolders
    to store images of tumors and non-tumors, respectively.

    The function is idempotent: anything that already exists is left alone
    and reported, and anything missing is created. In particular, the
    subfolders are checked even when their parent folder already exists,
    so a partially built tree is completed on the next call.
    """
    current_wd = Path().cwd()
    image_folder = (
        current_wd.parent
        / Path("data")
        / Path("brain_tumor_dataset")
        / Path("formatted_images")
    )

    folders = [
        image_folder / Path("TRAIN_IMGS"),
        image_folder / Path("TEST_IMGS"),
        image_folder / Path("VALIDATION_IMGS"),
    ]
    sub_folders = [Path("YES"), Path("NO")]

    for f in folders:
        # Path.name is the final path component on every platform.
        if f.exists():
            print(f"Folder - {f.name} already exists.")
        else:
            f.mkdir(parents=True)
            print(f"Folder - {f.name} created.")

        # Always inspect the subfolders, whether or not the parent was new.
        for sf in sub_folders:
            sub_folder_dir = f / sf
            if sub_folder_dir.exists():
                print(f"SubFolder - {sf.name} already exists.")
            else:
                sub_folder_dir.mkdir(parents=True)
                print(f"SubFolder - {sf.name} created.")


In [4]:
def populate_image_folders() -> None:
    """
    Populate the train/test/val image folders from the source folders.

    This function copies the images from the 'yes' and 'no' folders within
    the brain_tumor_dataset/raw_images directory into the 'TEST_IMGS',
    'TRAIN_IMGS', and 'VALIDATION_IMGS' folders within the
    brain_tumor_dataset/formatted_images directory, placing each image in
    the 'YES' or 'NO' subfolder that matches its class. Files keep their
    original names.

    For each class, with N image files sorted by path:
      * the first round(N * 0.2) files go to TEST_IMGS,
      * the last round((N - test) * 0.1) files go to VALIDATION_IMGS,
      * everything in between goes to TRAIN_IMGS.

    The split is positional (no shuffling), so it is the same on every run.
    Destination folders are created if they do not exist yet.
    """
    current_wd = Path().cwd()
    source_image_path = (
        current_wd.parent
        / Path("data")
        / Path("brain_tumor_dataset")
        / Path("raw_images")
    )
    formatted_image_path = source_image_path.parent / Path("formatted_images")

    classes = ["yes", "no"]
    split_names = ["TEST_IMGS", "TRAIN_IMGS", "VALIDATION_IMGS"]

    for c in classes:
        class_folder = source_image_path / Path(c)

        # List the folder once so the count and the iteration agree, keep
        # only regular files, and sort for a reproducible assignment.
        image_paths = sorted(p for p in class_folder.glob("*") if p.is_file())
        image_folder_count = len(image_paths)

        test_split = int(round(image_folder_count * 0.2, 0))
        val_split = int(round((image_folder_count - test_split) * 0.1, 0))
        train_end = image_folder_count - val_split  # first validation index

        # Make sure each target is a directory; otherwise shutil.copy would
        # write a single file named after the missing folder.
        destinations = {}
        for split_name in split_names:
            destination = formatted_image_path / Path(split_name) / Path(c.upper())
            destination.mkdir(parents=True, exist_ok=True)
            destinations[split_name] = destination

        for n, image_path in enumerate(image_paths):
            # n is zero-based, so strict comparisons give exactly
            # test_split test images and val_split validation images.
            if n < test_split:
                split_name = "TEST_IMGS"
            elif n < train_end:
                split_name = "TRAIN_IMGS"
            else:
                split_name = "VALIDATION_IMGS"
            shutil.copy(image_path, destinations[split_name])


In [5]:
def tidy_filenames_folders() -> None:
    """
    Tidy the filenames of the newly populated test/train/val folders.

    This function renames the entries within the 'YES' and 'NO' subfolders
    of 'TRAIN_IMGS', 'TEST_IMGS', and 'VALIDATION_IMGS' (under
    brain_tumor_dataset/formatted_images, resolved relative to the parent of
    the current working directory): spaces in the name become underscores
    and the whole name, extension included, is converted to uppercase.

    Entries whose name is already tidy are left alone. If the tidied name is
    already taken by a different file, the entry is skipped and reported
    instead of overwriting that file.
    """
    current_wd = Path().cwd()
    image_folder = (
        current_wd.parent
        / Path("data")
        / Path("brain_tumor_dataset")
        / Path("formatted_images")
    )

    parent_folders = ["TRAIN_IMGS", "TEST_IMGS", "VALIDATION_IMGS"]
    child_folders = ["YES", "NO"]

    for pf in parent_folders:
        for cf in child_folders:
            image_location = image_folder / Path(pf) / Path(cf)
            # Snapshot (and sort) the listing before renaming so the
            # directory is not modified while it is being iterated.
            for img in sorted(image_location.glob("*")):
                tidy_name = img.name.replace(" ", "_").upper()
                if tidy_name == img.name:
                    continue

                # with_name swaps only the last component, so no manual
                # splitting on a path separator is needed.
                target = img.with_name(tidy_name)

                # samefile covers case-insensitive filesystems, where the
                # target "exists" only because it is this very file.
                if target.exists() and not target.samefile(img):
                    print(f"Skipping {img.name}: {target.name} already exists.")
                    continue

                img.rename(target)


In [6]:
# Destructive step: deletes the duplicate files it finds under raw_images.
remove_duplicate_images()

Total number of duplicated images: 25


In [7]:
# Build the formatted_images/{TRAIN,TEST,VALIDATION}_IMGS/{YES,NO} tree.
create_folder_structure()


Folder - TRAIN_IMGS created.
SubFolder - YES created.
SubFolder - NO created.
Folder - TEST_IMGS created.
SubFolder - YES created.
SubFolder - NO created.
Folder - VALIDATION_IMGS created.
SubFolder - YES created.
SubFolder - NO created.


In [8]:
# Copy the raw_images files into the formatted_images split folders.
populate_image_folders()


In [21]:
# Rename the copied files in place (spaces -> underscores, upper-case).
# NOTE(review): this cell's execution count (21) is out of sequence with the
# cells above — confirm the notebook reproduces under Restart & Run All.
tidy_filenames_folders()
