# Data Inspection and Cleaning Notebook

## Loading libraries

In [1]:
import os
import tensorflow as tf
import cv2

## 1. Data Inspection

### 1.1 Check data consistency

In [2]:
def check_dataset_consistency(base_path):
    """
    Check the consistency of the dataset structure and report every issue.

    Rules verified for each entry directly under ``base_path``:
    1. The entry is a folder containing two subdirectories: 'original' and 'no_bg'.
    2. Each subdirectory contains exactly one JPEG image
       ('.jpg' / '.jpeg', matched case-insensitively).
    3. The two images have the same height and width.

    A summary is printed to stdout, and the collected issues are also
    returned so callers can decide whether to continue.

    Parameters
    ----------
    base_path : str
        Root folder of the dataset.

    Returns
    -------
    list of str
        One human-readable message per inconsistency; empty when the
        dataset is consistent.
    """
    jpeg_suffixes = (".jpg", ".jpeg")
    inconsistencies = []  # List to collect any issues found

    # Sorted so the report order is the same on every run
    for subfolder in sorted(os.listdir(base_path)):
        subfolder_path = os.path.join(base_path, subfolder)

        # Check if the item is a directory
        if not os.path.isdir(subfolder_path):
            inconsistencies.append(f"{subfolder_path} is not a folder.")
            continue

        # Check if the required subdirectories 'original' and 'no_bg' exist
        original_path = os.path.join(subfolder_path, "original")
        no_bg_path = os.path.join(subfolder_path, "no_bg")

        if not os.path.isdir(original_path) or not os.path.isdir(no_bg_path):
            inconsistencies.append(f"{subfolder_path} is missing 'original' or 'no_bg' folder.")
            continue

        # Retrieve all JPEG images in each folder (extension compared in lower case
        # so files such as 'IMG_01.JPG' are counted too)
        original_images = [f for f in os.listdir(original_path) if f.lower().endswith(jpeg_suffixes)]
        no_bg_images = [f for f in os.listdir(no_bg_path) if f.lower().endswith(jpeg_suffixes)]

        # Ensure that each folder contains exactly one image
        if len(original_images) != 1 or len(no_bg_images) != 1:
            inconsistencies.append(f"{subfolder_path} does not have exactly one image in each folder.")
            continue

        # Get the full paths of the images
        original_image_path = os.path.join(original_path, original_images[0])
        no_bg_image_path = os.path.join(no_bg_path, no_bg_images[0])

        # Compare the dimensions of the two images
        try:
            img1 = cv2.imread(original_image_path)  # Load the first image
            img2 = cv2.imread(no_bg_image_path)     # Load the second image

            # The result is checked for None here to detect files that could not be loaded
            if img1 is None or img2 is None:
                inconsistencies.append(f"Images in {subfolder_path} could not be loaded.")
                continue

            if img1.shape[:2] != img2.shape[:2]:  # Compare height and width
                inconsistencies.append(f"Images in {subfolder_path} do not have the same dimensions.")
        except Exception as e:
            # Catch and log any errors during image loading or processing
            inconsistencies.append(f"Error processing images in {subfolder_path}: {e}")

    # Print the results
    if not inconsistencies:
        print("All dataset files are consistent!")
    else:
        print("Inconsistencies found:")
        for issue in inconsistencies:
            print(issue)

    # Hand the issues back so callers can gate later steps on them
    return inconsistencies

### 1.2 Identify corrupted files

In [3]:
def find_problematic_images(folder_path):
    """
    Scan a directory tree for files that TensorFlow fails to read or decode.

    Every file found under ``folder_path`` (recursively) is read with
    ``tf.io.read_file`` and decoded with ``tf.image.decode_image``. Any file
    for which this raises is reported on stdout and recorded.

    Parameters
    ----------
    folder_path : str
        Root of the directory tree to scan.

    Returns
    -------
    list of str
        Paths of the files that raised during reading or decoding.
    """
    failed_paths = []

    for current_dir, _, filenames in os.walk(folder_path):
        for name in filenames:
            candidate = os.path.join(current_dir, name)
            try:
                decoded = tf.image.decode_image(tf.io.read_file(candidate))
                # Materialize the tensor so any decoding error surfaces inside this try
                decoded.numpy()
            except Exception as err:
                print(f"Problematic image found: {candidate} | Error: {err}")
                failed_paths.append(candidate)

    return failed_paths

## 2. Data Cleaning

### 2.1 Renaming and moving Images

In [5]:
import os
import shutil



# Reorganize image/mask pairs into two flat output folders
def move_files(base_dir, images_dir, masks_dir):
    """
    Move every image/mask pair found under ``base_dir`` into flat folders.

    Each immediate subfolder of ``base_dir`` is expected to hold an
    'original' and a 'no_bg' directory. For every file in 'original' that has
    a file with the same name in 'no_bg', the pair is moved to ``images_dir``
    and ``masks_dir`` and renamed ``NNNN_image.jpg`` / ``NNNN_mask.png``,
    where ``NNNN`` is a shared zero-padded index.

    Subfolders and files are processed in sorted order so the numbering is
    reproducible. Indices already present in either output folder are
    skipped, so running the function again never overwrites earlier results.

    Parameters
    ----------
    base_dir : str
        Dataset root containing one subfolder per sample.
    images_dir : str
        Destination folder for the original images (created if missing).
    masks_dir : str
        Destination folder for the masks (created if missing).

    Returns
    -------
    int
        Number of image/mask pairs that were moved.
    """
    # Create the output directories if they do not exist
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(masks_dir, exist_ok=True)

    # os.scandir yields entries in arbitrary order; sort for stable numbering
    with os.scandir(base_dir) as entries:
        subfolders = sorted(entry.path for entry in entries if entry.is_dir())

    next_index = 0   # Index used to build the next pair of output names
    moved_count = 0  # Number of pairs actually moved

    for subfolder in subfolders:
        original_dir = os.path.join(subfolder, "original")
        no_bg_dir = os.path.join(subfolder, "no_bg")

        if not os.path.exists(original_dir) or not os.path.exists(no_bg_dir):
            print(f"Skipping {subfolder}: Missing 'original' or 'no_bg' folder.")
            continue

        # Process original images together with their masks
        for img_file in sorted(os.listdir(original_dir)):
            img_path = os.path.join(original_dir, img_file)
            mask_file = img_file  # The mask is assumed to share the image's file name
            mask_path = os.path.join(no_bg_dir, mask_file)

            if not (os.path.isfile(img_path) and os.path.isfile(mask_path)):
                print(f"Skipping: Image or mask missing for {img_file} in {subfolder}.")
                continue

            # Advance to the first index whose image AND mask names are both free,
            # so files left by a previous run are never clobbered
            while True:
                new_img_path = os.path.join(images_dir, f"{next_index:04d}_image.jpg")
                new_mask_path = os.path.join(masks_dir, f"{next_index:04d}_mask.png")
                if not (os.path.exists(new_img_path) or os.path.exists(new_mask_path)):
                    break
                next_index += 1

            # Files are only moved/renamed; the '.png' suffix on the mask does not
            # involve any format conversion of the file contents
            shutil.move(img_path, new_img_path)
            shutil.move(mask_path, new_mask_path)

            next_index += 1
            moved_count += 1

    print(f"Reorganized {moved_count} images and masks into {images_dir} and {masks_dir}.")
    return moved_count

# Main

In [6]:
if __name__ == "__main__":

    # Dataset root and the two flat output folders
    base_path = "../data/public_hand_dataset/"
    output_images_dir = "../data/images"
    output_masks_dir = "../data/masks"

    # Step 1: Check consistency
    # NOTE: this gate is only effective when check_dataset_consistency returns
    # its list of issues; a None return is falsy and falls through to the move step.
    inconsistencies = check_dataset_consistency(base_path)
    if not inconsistencies:
        print("Dataset is consistent. Proceeding to rename images...")

        # Step 2: Rename images and move them into the output folders
        move_files(base_path, output_images_dir, output_masks_dir)
    else:
        print("Inconsistencies found:")
        for issue in inconsistencies:
            print(issue)


All dataset files are consistent!
Dataset is consistent. Proceeding to rename images...
Reorganized 150 images and masks into ../data/images and ../data/masks.
