### Create a dataset where every mask class except the crack-related ones is set to background (removed) and all crack classes are combined into one (or, with `merge=False`, kept as separate classes)

In [None]:
import os
import shutil
import numpy as np
from PIL import Image

# --- CONFIGURATION ---
# Root folder under which the dataset lives.
BASE_DIR = '/app/data'

# Every path below hangs off the same dated dataset folder.
_DATASET_DIR = BASE_DIR + '/2026-01-19-defect_dataset'

# Source folders (only SOURCE_LBL_DIR is read by combine_and_filter_dataset).
SOURCE_IMG_DIR = _DATASET_DIR + '/images'
SOURCE_LBL_DIR = _DATASET_DIR + '/labels_full'

# Output folders: one per mode of combine_and_filter_dataset().
# merge=True  -> OUT_BASE_MERGED
# merge=False -> OUT_BASE_MULTICLASS
OUT_BASE_MERGED = _DATASET_DIR + '/labels_cracks_merged'
OUT_BASE_MULTICLASS = _DATASET_DIR + '/labels_basic_defects'

# Label values treated as foreground; every other value is rewritten to 0.
TARGET_CLASS_IDS = [1, 2, 3, 4, 5, 6, 7, 13]

def combine_and_filter_dataset(merge=True):
    """Write a filtered copy of every label mask found in SOURCE_LBL_DIR.

    Each ``.png`` file in ``SOURCE_LBL_DIR`` is loaded as an 8-bit grayscale
    array and saved under the same file name in the output folder:

    * ``merge=True``  -> pixels whose value is in ``TARGET_CLASS_IDS`` become
      1, all other pixels become 0; output goes to ``OUT_BASE_MERGED``.
    * ``merge=False`` -> pixels whose value is in ``TARGET_CLASS_IDS`` keep
      their value, all other pixels become 0; output goes to
      ``OUT_BASE_MULTICLASS``.

    Masks that end up without any foreground pixel are NOT skipped: they are
    still written, and only counted in the "Empty" figure of the summary.

    Args:
        merge: True for binary output, False to preserve the target class IDs.

    Returns:
        None. Masks are written to disk and a summary is printed.
    """
    # Select path based on mode
    out_base = OUT_BASE_MERGED if merge else OUT_BASE_MULTICLASS

    if os.path.exists(out_base):
        print(f"Note: Output folder {out_base} already exists.")

    os.makedirs(out_base, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sort so that every run
    # visits the files in the same sequence.
    label_files = sorted(f for f in os.listdir(SOURCE_LBL_DIR) if f.endswith('.png'))
    processed_count = 0
    empty_count = 0

    print(f"üöÄ Mode: {'MERGE (Binary)' if merge else 'PRESERVE (Multiclass)'}")
    print(f"Processing {len(label_files)} files...")

    for lbl_file in label_files:
        src_lbl_path = os.path.join(SOURCE_LBL_DIR, lbl_file)

        # The context manager releases the file handle as soon as the pixels
        # have been copied into the array.
        # NOTE(review): convert('L') on a palette-mode ('P') PNG converts
        # through the palette colours instead of keeping the stored indices -
        # confirm the source labels are plain 8-bit grayscale.
        with Image.open(src_lbl_path) as src_img:
            mask = np.array(src_img.convert('L'))

        is_target = np.isin(mask, TARGET_CLASS_IDS)
        if merge:
            # Collapse every value listed in TARGET_CLASS_IDS to 1.
            new_mask = np.zeros_like(mask)
            new_mask[is_target] = 1
        else:
            # Keep the original value for target pixels, everything else 0.
            new_mask = np.where(is_target, mask, 0)

        # Count (but still save) masks with no foreground pixel.
        if not np.any(new_mask > 0):
            empty_count += 1

        # Save Label
        result_img = Image.fromarray(new_mask.astype(np.uint8))
        result_img.save(os.path.join(out_base, lbl_file))
        processed_count += 1

    print(f"‚úÖ Saved: {processed_count} images | ‚ùå Empty: {empty_count}")
    print(f"üìÇ Location: {out_base}")

if __name__ == "__main__":
    # merge=False keeps the original values of TARGET_CLASS_IDS and writes to
    # OUT_BASE_MULTICLASS; merge=True would collapse them to 1 and write to
    # OUT_BASE_MERGED.
    # (Original note: merge=False was chosen to address a "spatial
    # confusion/233px error" - the context of that error is not visible in
    # this notebook.)
    combine_and_filter_dataset(merge=False)

In [10]:
def verify_output(label_dir=None):
    """Check that every saved mask contains only the values 0 and 1.

    Every ``.png`` file in ``label_dir`` is loaded as 8-bit grayscale. The set
    of values found across all files is printed, followed by a SUCCESS line
    when no file contains a value other than 0 or 1, or a FAILURE line listing
    the offending file names otherwise.

    Args:
        label_dir: Folder holding the generated masks. Defaults to
            ``OUT_BASE_MERGED``, the folder written by
            ``combine_and_filter_dataset(merge=True)``.

    Returns:
        None. The report is printed.
    """
    if label_dir is None:
        label_dir = OUT_BASE_MERGED

    print("\nüîé Verifying output classes...")

    # Only look at mask files: the folder may also hold entries that are not
    # images (e.g. a Jupyter .ipynb_checkpoints directory), which would make
    # Image.open() fail.
    label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))
    allowed_ids = {0, 1}
    global_unique_ids = set()
    errors = []

    for f in label_files:
        path = os.path.join(label_dir, f)
        # Load the newly created mask; the handle is closed on leaving the block.
        with Image.open(path) as img:
            mask = np.array(img.convert('L'))

        # Plain ints so the printed list does not depend on numpy's scalar repr.
        file_ids = {int(v) for v in np.unique(mask)}
        global_unique_ids |= file_ids

        # Flag any file holding a value that is NOT 0 or 1
        if not file_ids <= allowed_ids:
            errors.append(f)

    print("-" * 30)
    print(f"Found unique class IDs across all files: {sorted(global_unique_ids)}")

    if not errors and global_unique_ids <= allowed_ids:
        print("‚úÖ SUCCESS: Dataset contains strictly 2 classes (0: Background, 1: Crack).")
    else:
        print(f"‚ùå FAILURE: Found unexpected classes or files with errors: {errors}")

# Run the check right away so its report is printed as this cell's output.
verify_output()


üîé Verifying output classes...
------------------------------
Found unique class IDs across all files: [0, 1]
‚úÖ SUCCESS: Dataset contains strictly 2 classes (0: Background, 1: Crack).


In [2]:
# Create the train / validation split files.

import os
import random
# CONFIGURATION
# Make sure DATA_DIR matches your dataset folder name.
# NOTE(review): this root differs from the '/app/data/2026-01-19-defect_dataset'
# paths used by the label-generation cell earlier in the notebook - confirm
# that this is intentional.
DATA_DIR = '/app/data/multi_crack'
# Folder that is scanned for .jpg / .png images.
IMG_DIR = os.path.join(DATA_DIR, 'images')
# Folder that receives train.txt and val.txt.
SPLITS_DIR = os.path.join(DATA_DIR, 'splits')

def create_train_val_split_from_folder(val_size=100, train_size=1500, seed=42,
                                       img_dir=None, splits_dir=None):
    """Write ``train.txt`` and ``val.txt`` with a random split of image IDs.

    The image folder is scanned for ``.jpg`` / ``.png`` files (extension match
    is case-insensitive). The file list is sorted and then shuffled with a
    private, seeded generator, so the same folder content and the same seed
    always give the same split. The first ``val_size`` shuffled files form the
    validation set and the next ``train_size`` files form the training set;
    any remaining files are left out of both lists. If the folder holds fewer
    files than requested, the lists are simply shorter.

    File names are written without their extension, one per line.

    Args:
        val_size: Maximum number of validation images (default 100).
        train_size: Maximum number of training images (default 1500).
        seed: Seed for the shuffle. Pass ``None`` for a different split on
            every call.
        img_dir: Folder to scan. Defaults to the module-level ``IMG_DIR``.
        splits_dir: Folder receiving the two text files. Defaults to the
            module-level ``SPLITS_DIR``.

    Returns:
        None. When no image is found an error is printed and nothing is
        written.

    Raises:
        ValueError: If ``val_size`` or ``train_size`` is negative.
    """
    if val_size < 0 or train_size < 0:
        raise ValueError("val_size and train_size must be non-negative")

    img_dir = IMG_DIR if img_dir is None else img_dir
    splits_dir = SPLITS_DIR if splits_dir is None else splits_dir

    # 1. Setup paths
    os.makedirs(splits_dir, exist_ok=True)

    # 2. Scan folder for all images (.jpg and .png). Sorting removes the
    #    arbitrary ordering of os.listdir(), which would otherwise defeat the
    #    seed.
    all_images = sorted(
        f for f in os.listdir(img_dir) if f.lower().endswith(('.jpg', '.png'))
    )

    if not all_images:
        print(f"‚ùå Error: No images found in {img_dir}")
        return

    print(f"Found {len(all_images)} images in folder. Creating random split...")

    # 3. Shuffle with a private generator so the global `random` state that
    #    other cells may rely on is left untouched.
    random.Random(seed).shuffle(all_images)

    # 4. Carve out validation first, then training, so the two never overlap.
    val_files = all_images[:val_size]
    train_files = all_images[val_size:val_size + train_size]

    # 5. Strip extensions (remove .jpg/.png) for the text files
    #    This is crucial for segmentation dataloaders that expect just the ID
    train_names = [os.path.splitext(f)[0] for f in train_files]
    val_names = [os.path.splitext(f)[0] for f in val_files]

    # 6. Write to files
    with open(os.path.join(splits_dir, 'train.txt'), 'w') as f:
        f.write('\n'.join(train_names))

    with open(os.path.join(splits_dir, 'val.txt'), 'w') as f:
        f.write('\n'.join(val_names))

    print(f"‚úÖ Random split files created in {splits_dir}")
    print(f"Stats: {len(train_names)} train images, {len(val_names)} val images.")

if __name__ == "__main__":
    # Generate splits/train.txt and splits/val.txt for the configured DATA_DIR.
    create_train_val_split_from_folder()

Found 6081 images in folder. Creating random split...
‚úÖ Random split files created in /app/data/multi_crack/splits
Stats: 1500 train images, 100 val images.
