### Create a dataset where all masks except crack-related ones are set to background (removed) and all crack classes are combined into one

In [9]:
import os
import shutil
import numpy as np
from PIL import Image

# --- CONFIGURATION ---
# Root folder that holds both the source dataset and the generated output.
BASE_DIR = '/app/data'

# Source dataset: images and their label masks.
SOURCE_IMG_DIR = BASE_DIR + '/2026-01-19-defect_dataset/images'
SOURCE_LBL_DIR = BASE_DIR + '/2026-01-19-defect_dataset/labels'

# Output dataset: same images/labels layout, under its own folder.
OUT_BASE = BASE_DIR + '/combine_crack'
OUT_IMG_DIR = OUT_BASE + '/images'
OUT_LBL_DIR = OUT_BASE + '/labels'

# ⚠️ UPDATE THESE IDS: integer values of the crack classes in the source masks.
# Example: if cracks=1, alligator=2, severe=3, use [1, 2, 3].
TARGET_CLASS_IDS = [1, 2, 3]

def _read_class_id_mask(path):
    """Load a label PNG and return its pixels as a 2-D array of class IDs.

    Palette ('P') images are returned as their raw palette indices, because
    converting them to 'L' would replace each index with the luminance of its
    palette colour. Every other mode is converted to 8-bit grayscale ('L').
    """
    # The context manager releases the file handle even if decoding fails.
    with Image.open(path) as img:
        pixels = img if img.mode == 'P' else img.convert('L')
        return np.array(pixels)


def _find_source_image(stem):
    """Return the file name in SOURCE_IMG_DIR matching `stem`, or None.

    '.jpg' is tried before '.png'.
    """
    for ext in ('.jpg', '.png'):
        candidate = stem + ext
        if os.path.exists(os.path.join(SOURCE_IMG_DIR, candidate)):
            return candidate
    return None


def combine_and_filter_dataset():
    """Build a binary crack dataset from the multi-class source dataset.

    For every '.png' mask in SOURCE_LBL_DIR, pixels whose class ID is listed in
    TARGET_CLASS_IDS become 1 and every other pixel becomes 0. A sample is
    written to OUT_LBL_DIR / OUT_IMG_DIR only when the merged mask contains at
    least one crack pixel AND a matching source image exists, so the output
    never holds a label without its image.

    Files already present in the output folders are left in place; only
    same-named files are overwritten.

    Returns:
        None. Progress and a summary are printed.
    """
    # 1. Setup output directories
    if os.path.exists(OUT_BASE):
        print(f"Warning: Output folder {OUT_BASE} already exists.")
    os.makedirs(OUT_IMG_DIR, exist_ok=True)
    os.makedirs(OUT_LBL_DIR, exist_ok=True)

    # Sorted so the processing order (and the log) is the same on every run.
    label_files = sorted(f for f in os.listdir(SOURCE_LBL_DIR) if f.endswith('.png'))

    processed_count = 0
    skipped_count = 0

    print(f"Starting processing of {len(label_files)} files...")
    print(f"Target merging classes: {TARGET_CLASS_IDS}")

    for lbl_file in label_files:
        # 2. Load the mask as integer class IDs
        mask = _read_class_id_mask(os.path.join(SOURCE_LBL_DIR, lbl_file))

        # 3. Merge: 1 where the pixel belongs to any target class, 0 elsewhere
        merged_mask = np.isin(mask, TARGET_CLASS_IDS).astype(np.uint8)

        # 4. Filter: drop samples that contain no crack pixel at all
        if not merged_mask.any():
            skipped_count += 1
            continue

        # 5. Locate the image BEFORE writing anything, so a label without an
        #    image never ends up in the output folder.
        #    splitext only strips the final extension (str.replace would also
        #    rewrite a '.png' occurring in the middle of the name).
        stem = os.path.splitext(lbl_file)[0]
        img_name = _find_source_image(stem)
        if img_name is None:
            print(f"Warning: Label {lbl_file} has no matching image file. Skipped.")
            continue

        # 6. Write the merged mask and copy the image as a pair
        Image.fromarray(merged_mask).save(os.path.join(OUT_LBL_DIR, lbl_file))
        shutil.copy(os.path.join(SOURCE_IMG_DIR, img_name),
                    os.path.join(OUT_IMG_DIR, img_name))
        processed_count += 1

    print("--- Processing Complete ---")
    print(f"‚úÖ Saved: {processed_count} images (containing merged cracks)")
    print(f"‚ùå Skipped: {skipped_count} images (background only / no cracks)")
    print(f"üìÇ Output location: {OUT_BASE}")

# Entry point: run the merge when this code executes as the main module.
if __name__ == "__main__":
    combine_and_filter_dataset()

Starting processing of 7286 files...
Target merging classes: [1, 2, 3]
--- Processing Complete ---
‚úÖ Saved: 6081 images (containing merged cracks)
‚ùå Skipped: 1205 images (background only / no cracks)
üìÇ Output location: /app/data/combine_crack


In [10]:
def verify_output(label_dir=None):
    """Check that every output label mask contains only class IDs 0 and 1.

    Args:
        label_dir: Folder holding the label PNGs to check. Defaults to
            OUT_LBL_DIR when omitted.

    Returns:
        None. The result is printed: the set of class IDs found across all
        files, followed by a SUCCESS or FAILURE line. Files that cannot be
        read, or that contain any other ID, are listed in the FAILURE line.
    """
    if label_dir is None:
        label_dir = OUT_LBL_DIR

    print("\nüîé Verifying output classes...")

    # Only real PNG files are inspected; stray entries such as a
    # '.ipynb_checkpoints' folder would otherwise crash Image.open.
    label_files = sorted(
        f for f in os.listdir(label_dir)
        if f.lower().endswith('.png') and os.path.isfile(os.path.join(label_dir, f))
    )
    if not label_files:
        # An empty folder must not be reported as a successful verification.
        print(f"Warning: no label PNG files found in {label_dir}; nothing was verified.")
        return

    allowed_ids = {0, 1}
    global_unique_ids = set()
    errors = []

    for f in label_files:
        path = os.path.join(label_dir, f)
        try:
            # Close the handle after reading. Palette masks are read as raw
            # indices so the check sees the stored class IDs, not colours.
            with Image.open(path) as img:
                pixels = img if img.mode == 'P' else img.convert('L')
                mask = np.array(pixels)
        except OSError:
            # Unreadable/corrupt file: report it instead of aborting the run.
            errors.append(f)
            continue

        # Plain ints keep the printed list free of numpy scalar reprs.
        file_ids = {int(u) for u in np.unique(mask)}
        global_unique_ids |= file_ids

        # Any value other than 0 or 1 marks the file as invalid
        if not file_ids <= allowed_ids:
            errors.append(f)

    print("-" * 30)
    print(f"Found unique class IDs across all files: {sorted(global_unique_ids)}")

    if not errors:
        print("‚úÖ SUCCESS: Dataset contains strictly 2 classes (0: Background, 1: Crack).")
    else:
        print(f"‚ùå FAILURE: Found unexpected classes or files with errors: {errors}")

# Run the check on the label masks in OUT_LBL_DIR.
verify_output()


üîé Verifying output classes...
------------------------------
Found unique class IDs across all files: [0, 1]
‚úÖ SUCCESS: Dataset contains strictly 2 classes (0: Background, 1: Crack).


In [11]:
# Create the train/validation split

import os
import random
# CONFIGURATION
# Root of the dataset to split; make sure this matches your folder name.
DATA_DIR = '/app/data/combine_crack'
# Folder scanned for images, and folder that receives the split lists.
IMG_DIR, SPLITS_DIR = (os.path.join(DATA_DIR, sub) for sub in ('images', 'splits'))

def create_train_val_split_from_folder(img_dir=None, splits_dir=None,
                                       val_size=100, train_size=1500, seed=42):
    """Write reproducible train/val ID lists for the images in a folder.

    The folder is scanned for '.jpg'/'.png' files, their extensions are
    stripped, and the resulting IDs are shuffled. The first `val_size` IDs go
    to 'val.txt', the next `train_size` IDs to 'train.txt' (one ID per line);
    any remaining IDs are left out of both lists.

    Args:
        img_dir: Folder to scan. Defaults to IMG_DIR when omitted.
        splits_dir: Folder receiving 'train.txt' and 'val.txt'; created if
            missing. Defaults to SPLITS_DIR when omitted.
        val_size: Maximum number of validation IDs.
        train_size: Maximum number of training IDs.
        seed: Seed for the shuffle. The same seed and folder content always
            produce the same split; pass None for a different split each run.

    Returns:
        None. Progress and final counts are printed.

    Raises:
        ValueError: If `val_size` or `train_size` is negative.
    """
    if val_size < 0 or train_size < 0:
        raise ValueError("val_size and train_size must be non-negative")
    if img_dir is None:
        img_dir = IMG_DIR
    if splits_dir is None:
        splits_dir = SPLITS_DIR

    # 1. Setup paths
    os.makedirs(splits_dir, exist_ok=True)

    # 2. Scan folder for all images (.jpg and .png, any letter case)
    all_images = [f for f in os.listdir(img_dir) if f.lower().endswith(('.jpg', '.png'))]

    if not all_images:
        print(f"‚ùå Error: No images found in {img_dir}")
        return

    print(f"Found {len(all_images)} images in folder. Creating random split...")

    # 3. Strip extensions: segmentation dataloaders expect just the ID.
    #    The set removes duplicates (e.g. 'a.jpg' + 'a.png') so one ID can
    #    never land in both train and val; sorting makes the starting order
    #    independent of the arbitrary order os.listdir returns.
    sample_ids = sorted({os.path.splitext(f)[0] for f in all_images})

    # 4. Seeded shuffle on a private generator: reproducible, and the global
    #    `random` state used elsewhere is left untouched.
    random.Random(seed).shuffle(sample_ids)

    # 5. Validation IDs first, then training IDs from what is left
    val_names = sample_ids[:val_size]
    train_names = sample_ids[val_size:val_size + train_size]

    if not train_names:
        print("Warning: no images left for the training split.")

    # 6. Write to files
    with open(os.path.join(splits_dir, 'train.txt'), 'w') as f:
        f.write('\n'.join(train_names))

    with open(os.path.join(splits_dir, 'val.txt'), 'w') as f:
        f.write('\n'.join(val_names))

    print(f"‚úÖ Random split files created in {splits_dir}")
    print(f"Stats: {len(train_names)} train images, {len(val_names)} val images.")

# Entry point: create the split files when this code executes as the main module.
if __name__ == "__main__":
    create_train_val_split_from_folder()

Found 6081 images in folder. Creating random split...
‚úÖ Random split files created in /app/data/combine_crack/splits
Stats: 1500 train images, 100 val images.
