# Data Inspection and Cleaning Notebook

## Loading libraries

In [9]:
import os
import tensorflow as tf
import cv2

## 1. Data Inspection

### 1.1 Check data consistency

In [18]:
def check_dataset_consistency(base_path):
    """
    Check that every sample folder under ``base_path`` has the expected layout.

    A sample folder is considered consistent when:
    1. It contains two subdirectories: 'original' and 'no_bg'.
    2. Each subdirectory contains exactly one file whose name ends in
       '.jpg' or '.jpeg' (the match is case-sensitive).
    3. Both images can be loaded with OpenCV and have the same height and width.

    A human-readable report is printed, and the same messages are returned so
    that callers can decide whether it is safe to continue.

    Args:
        base_path: Directory whose direct children are the sample folders.

    Returns:
        list[str]: One message per problem found; empty when nothing is wrong.
    """
    jpeg_suffixes = (".jpg", ".jpeg")
    inconsistencies = []  # Collected problem descriptions, in folder order

    # sorted(): os.listdir gives no ordering guarantee, and a stable order
    # makes the report reproducible from one run to the next.
    for subfolder in sorted(os.listdir(base_path)):
        subfolder_path = os.path.join(base_path, subfolder)

        # Anything that is not a directory does not belong at this level
        if not os.path.isdir(subfolder_path):
            inconsistencies.append(f"{subfolder_path} is not a folder.")
            continue

        # Both 'original' and 'no_bg' must be present
        original_path = os.path.join(subfolder_path, "original")
        no_bg_path = os.path.join(subfolder_path, "no_bg")

        if not (os.path.isdir(original_path) and os.path.isdir(no_bg_path)):
            inconsistencies.append(f"{subfolder_path} is missing 'original' or 'no_bg' folder.")
            continue

        # Collect the JPEG files found in each subdirectory
        original_images = [f for f in os.listdir(original_path) if f.endswith(jpeg_suffixes)]
        no_bg_images = [f for f in os.listdir(no_bg_path) if f.endswith(jpeg_suffixes)]

        # Exactly one image is expected on each side
        if len(original_images) != 1 or len(no_bg_images) != 1:
            inconsistencies.append(f"{subfolder_path} does not have exactly one image in each folder.")
            continue

        original_image_path = os.path.join(original_path, original_images[0])
        no_bg_image_path = os.path.join(no_bg_path, no_bg_images[0])

        # Compare the dimensions of the two images
        try:
            img1 = cv2.imread(original_image_path)
            img2 = cv2.imread(no_bg_image_path)

            # cv2.imread signals an unreadable file by returning None
            if img1 is None or img2 is None:
                inconsistencies.append(f"Images in {subfolder_path} could not be loaded.")
                continue

            if img1.shape[:2] != img2.shape[:2]:  # (height, width) only
                inconsistencies.append(f"Images in {subfolder_path} do not have the same dimensions.")
        except Exception as e:
            # Record the failure and keep checking the remaining folders
            inconsistencies.append(f"Error processing images in {subfolder_path}: {e}")

    # Print the results
    if not inconsistencies:
        print("All dataset files are consistent!")
    else:
        print("Inconsistencies found:")
        for issue in inconsistencies:
            print(issue)

    # Returning the list lets callers branch on the outcome; the function
    # previously returned None, which made every caller-side check pass.
    return inconsistencies

### 1.2 Identify corrupted files

In [19]:
def find_problematic_images(folder_path):
    """
    Recursively scan ``folder_path`` and collect files TensorFlow cannot decode.

    Every file found by the walk is read and passed to the image decoder; any
    exception raised along the way marks the file as problematic and is
    printed together with its path.

    Returns:
        list: Paths of the files that raised an error, in walk order.
    """
    bad_paths = []

    for current_dir, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            candidate = os.path.join(current_dir, filename)
            try:
                decoded = tf.image.decode_image(tf.io.read_file(candidate))
                # Materialise the tensor so decoding errors surface here
                decoded.numpy()
            except Exception as err:
                print(f"Problematic image found: {candidate} | Error: {err}")
                bad_paths.append(candidate)

    return bad_paths

## 2. Data Cleaning

### 2.1 Rename Images

In [20]:
def rename_images(base_path: str, starting_index: int = 1):
    """
    Rename the image pair of every sample folder to a consistent format.

    - 'hand_###_original.jpg' for the image inside 'original'.
    - 'hand_###_mask.jpg' for the image inside 'no_bg'.

    Sample folders are processed in sorted name order so that the numbering
    is reproducible. Entries that are not directories, lack one of the two
    subdirectories, or do not hold exactly one '.jpg'/'.jpeg' file on each
    side are skipped without consuming an index.

    A pair is renamed as a unit: if the mask cannot be renamed, the rename of
    the original is undone, so both files keep their old names and the index
    is reused for the next folder.

    Args:
        base_path: Directory whose direct children are the sample folders.
        starting_index: Number assigned to the first renamed pair.

    Returns:
        str: "All files have been renamed correctly!" when no rename failed,
        otherwise a message stating how many folders could not be renamed.
    """
    jpeg_suffixes = (".jpg", ".jpeg")
    counter = starting_index
    failed_folders = 0

    # sorted(): os.listdir returns entries in arbitrary order, which would
    # make the index given to each folder vary between runs and machines.
    for subfolder in sorted(os.listdir(base_path)):
        subfolder_path = os.path.join(base_path, subfolder)

        if not os.path.isdir(subfolder_path):
            continue

        # Paths for 'original' and 'no_bg' folders
        original_path = os.path.join(subfolder_path, "original")
        no_bg_path = os.path.join(subfolder_path, "no_bg")

        if not (os.path.isdir(original_path) and os.path.isdir(no_bg_path)):
            continue

        # Retrieve image files
        original_images = [f for f in os.listdir(original_path) if f.endswith(jpeg_suffixes)]
        no_bg_images = [f for f in os.listdir(no_bg_path) if f.endswith(jpeg_suffixes)]

        if len(original_images) != 1 or len(no_bg_images) != 1:
            continue

        # Full paths for the current and new files
        original_image_path = os.path.join(original_path, original_images[0])
        mask_image_path = os.path.join(no_bg_path, no_bg_images[0])

        original_new_path = os.path.join(original_path, f"hand_{counter:03d}_original.jpg")
        mask_new_path = os.path.join(no_bg_path, f"hand_{counter:03d}_mask.jpg")

        try:
            os.rename(original_image_path, original_new_path)
        except Exception as e:
            print(f"Error renaming files in {subfolder_path}: {e}")
            failed_folders += 1
            continue

        try:
            os.rename(mask_image_path, mask_new_path)
        except Exception as e:
            print(f"Error renaming files in {subfolder_path}: {e}")
            failed_folders += 1
            # Undo the first rename so the pair is not left half-renamed and
            # the index can be reused safely by the next folder.
            try:
                os.rename(original_new_path, original_image_path)
            except Exception as rollback_error:
                print(f"Could not restore {original_image_path}: {rollback_error}")
            continue

        counter += 1

    if failed_folders:
        return f"{failed_folders} folder(s) could not be renamed; see the errors printed above."
    return "All files have been renamed correctly!"

# Main

In [16]:
if __name__ == "__main__":
    base_path = "../data/public_hand_dataset/"

    # Step 1: Check consistency.
    # check_dataset_consistency prints its own report, so the individual
    # findings are not printed a second time here.
    inconsistencies = check_dataset_consistency(base_path)
    if inconsistencies:
        print("Renaming skipped: resolve the inconsistencies reported above first.")
    else:
        print("Dataset is consistent. Proceeding to rename images...")

        # Step 2: Rename images and show the status message that
        # rename_images returns (it was previously discarded).
        print(rename_images(base_path))


All dataset files are consistent!
Dataset is consistent. Proceeding to rename images...
Renamed: ../data/public_hand_dataset/100328385179725049535622994313026736131\original\hand.jpeg -> ../data/public_hand_dataset/100328385179725049535622994313026736131\original\hand_001_original.jpg
Renamed: ../data/public_hand_dataset/100328385179725049535622994313026736131\no_bg\hand.jpeg -> ../data/public_hand_dataset/100328385179725049535622994313026736131\no_bg\hand_001_mask.jpg
Renamed: ../data/public_hand_dataset/101246170554525927110592012192635617282\original\hand.jpeg -> ../data/public_hand_dataset/101246170554525927110592012192635617282\original\hand_002_original.jpg
Renamed: ../data/public_hand_dataset/101246170554525927110592012192635617282\no_bg\hand.jpeg -> ../data/public_hand_dataset/101246170554525927110592012192635617282\no_bg\hand_002_mask.jpg
Renamed: ../data/public_hand_dataset/102586421877817102033289037136558555138\original\hand.jpeg -> ../data/public_hand_dataset/1025864218778