In [2]:
import pathlib
import shutil
import random
import os
import tensorflow as tf  # Use TensorFlow instead of OpenCV

# ========================
# Core Configuration - Unified Path Management
# ========================
# Project root on the local machine -- edit this to match your own checkout
ROOT_DIR = pathlib.Path(r"C:\Users\86183\Desktop\final\uva-machine-learning-25f-projects\team-30")

# Raw input datasets (two separately downloaded sources)
dataset1 = ROOT_DIR.joinpath("data/original_data/edible-and-poisonous-fungi/versions/1")
dataset2 = ROOT_DIR.joinpath("data/original_data/image-mushroom-dataset/versions/1/image-mushroom-dataset")

# Output locations for the cleaned data
TARGET_DIR = ROOT_DIR.joinpath("data/cleaned")
BINARY_DIR = TARGET_DIR.joinpath("mushroom_binary")    # binary classification images
TESTING_DIR = TARGET_DIR.joinpath("mushroom_testing")  # testing set images

# Tunable parameters
train_ratio = 0.8  # fraction of images kept for training
# File suffixes treated as images (compared in lower case)
img_exts = {
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".gif",
}
random.seed(42)  # fixed seed for the shuffle performed during the split

# ========================
# Create Directory Structure
# ========================
# One sub-folder per label inside the binary classification directory
BINARY_DIR.joinpath("edible").mkdir(parents=True, exist_ok=True)
BINARY_DIR.joinpath("poisonous").mkdir(parents=True, exist_ok=True)

# Same label layout for the testing set
TESTING_DIR.joinpath("edible").mkdir(parents=True, exist_ok=True)
TESTING_DIR.joinpath("poisonous").mkdir(parents=True, exist_ok=True)

# ========================
# Image Validation Function (TensorFlow-based)
# ========================
def is_valid_image(file_path):
    """Report whether TensorFlow can read and decode the file as an image.

    Args:
        file_path: Path object of the file to check (its ``name`` is used
            in diagnostic messages).

    Returns:
        True if the file decodes to a rank-3 tensor whose first two
        dimensions are both at least 1; False otherwise, including when
        reading or decoding raises any exception (a message is printed).
    """
    try:
        raw_bytes = tf.io.read_file(str(file_path))
        # Format is auto-detected; animations are not expanded, so the
        # decoded result is a single image rather than a frame stack.
        decoded = tf.image.decode_image(raw_bytes, channels=3, expand_animations=False)
        dims = decoded.shape
        # Valid only when there are three axes and a non-empty height/width
        return dims.rank == 3 and dims[0] >= 1 and dims[1] >= 1
    except (tf.errors.InvalidArgumentError, tf.errors.NotFoundError) as err:
        # Undecodable data or a missing file
        print(f"Error checking file {file_path.name}: {str(err)}")
        return False
    except Exception as err:
        # Anything else is still treated as "not a valid image"
        print(f"Unexpected error checking file {file_path.name}: {str(err)}")
        return False

# ========================
# Data Processing Functions
# ========================
def get_class_imgs(src_folder, class_names):
    """Collect all valid image files stored in the given class directories.

    Every directory below ``src_folder`` whose name matches one of
    ``class_names`` is scanned (non-recursively) for files whose suffix is in
    ``img_exts``. Each candidate is checked with ``is_valid_image``; files
    that fail the check are reported and skipped.

    Directories and files are visited in sorted order so that the returned
    list -- and therefore any seeded shuffle applied to it later -- is the
    same on every run and every filesystem.

    Args:
        src_folder: ``pathlib.Path`` of the dataset root to search.
        class_names: Iterable of directory names (used as ``rglob`` patterns).

    Returns:
        List of ``pathlib.Path`` objects for the images that passed validation.
    """
    imgs = []
    for cls in class_names:
        # rglob yields matches in arbitrary order; sort for reproducibility
        for class_dir in sorted(src_folder.rglob(cls)):
            if not class_dir.is_dir():
                continue
            # iterdir order is likewise OS-dependent, so sort it too
            for f in sorted(class_dir.iterdir()):
                if f.suffix.lower() not in img_exts or not f.is_file():
                    continue
                if is_valid_image(f):
                    imgs.append(f)
                else:
                    print(f"Skipping corrupted file: {f}")
    return imgs

def split_data(imgs, train_ratio=0.8):
    """Shuffle the images and split them into training and testing subsets.

    The shuffle uses the module-level ``random`` generator, so seeding it
    beforehand makes the split reproducible. Shuffling is done on a copy;
    the caller's sequence is left unmodified.

    Args:
        imgs: Sequence of items (image paths) to split.
        train_ratio: Fraction of the items assigned to the training subset.
            Must lie in the range [0, 1].

    Returns:
        Tuple ``(train, test)`` of lists: ``train`` holds the first
        ``int(len(imgs) * train_ratio)`` shuffled items, ``test`` the rest.

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    # Out-of-range ratios would otherwise slice with a negative or
    # over-long index and silently produce a meaningless split.
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")
    shuffled = list(imgs)  # copy so the caller's list is not reordered
    random.shuffle(shuffled)
    split_idx = int(len(shuffled) * train_ratio)
    return shuffled[:split_idx], shuffled[split_idx:]

def copy_files(file_list, target_dir):
    """Copy files into ``target_dir`` under index-prefixed names.

    Each file is stored as ``<6-digit position>_<original name>`` so that
    files sharing a name in different source folders do not overwrite each
    other. The target directory is created if it does not exist yet. A copy
    that fails is reported with a warning and skipped.

    Args:
        file_list: Iterable of source file paths (``str`` or ``pathlib.Path``).
        target_dir: Destination directory (``str`` or ``pathlib.Path``).

    Returns:
        Number of files copied successfully.

    Raises:
        OSError: If ``target_dir`` cannot be created.
    """
    target_dir = pathlib.Path(target_dir)
    # Without this, every copy into a missing directory fails and the
    # function would only print warnings and return 0.
    target_dir.mkdir(parents=True, exist_ok=True)

    copied = 0
    for idx, img in enumerate(file_list):
        img = pathlib.Path(img)
        # The position prefix keeps names unique within this call
        new_name = f"{idx:06d}_{img.name}"
        try:
            shutil.copy2(img, target_dir / new_name)  # copy2 preserves file attributes
        except OSError as e:
            print(f"Warning: Failed to copy {img.name} - {str(e)}")
        else:
            copied += 1
    return copied

# ========================
# Load and Process Data
# ========================
# Directory names that mark each label in the original datasets
edible_classes = ["edible mushroom sporocarp", "edible sporocarp"]
poisonous_classes = ["poisonous mushroom sporocarp", "poisonous sporocarp"]

# Load all valid images (corrupted files are reported and skipped)
print("Loading dataset...")
dataset1_edible = get_class_imgs(dataset1, edible_classes)
dataset1_poison = get_class_imgs(dataset1, poisonous_classes)
dataset2_edible = get_class_imgs(dataset2, edible_classes)
dataset2_poison = get_class_imgs(dataset2, poisonous_classes)

# Merge both sources per label
all_edible = dataset1_edible + dataset2_edible
all_poison = dataset1_poison + dataset2_poison

# Fail early if a label came back empty -- most likely a wrong dataset path
if not all_edible:
    raise ValueError("No edible mushroom images found! Please check dataset paths")
if not all_poison:
    raise ValueError("No poisonous mushroom images found! Please check dataset paths")

# Split each label separately so both subsets keep the label proportions.
# Pass the configured ratio explicitly; otherwise the train_ratio setting
# would be ignored in favour of split_data's own default.
edible_train, edible_test = split_data(all_edible, train_ratio)
poison_train, poison_test = split_data(all_poison, train_ratio)

# ========================
# Copy Files to Corresponding Directories
# ========================
print("Copying files to binary classification directory...")
# Training set (for validation_split in image_dataset_from_directory)
copy_files(edible_train, BINARY_DIR / "edible")
copy_files(poison_train, BINARY_DIR / "poisonous")

# Independent testing set
print("Copying files to testing directory...")
copy_files(edible_test, TESTING_DIR / "edible")
copy_files(poison_test, TESTING_DIR / "poisonous")

# ========================
# Output Statistics
# ========================
# NOTE: counts come from listing the output directories, so they include any
# files already present there before this run.
print("\n" + "="*60)
print("Data processing completed!")
print("="*60)
print(f"Binary classification directory: {BINARY_DIR}")
print(f"  - Edible mushrooms: {len(os.listdir(BINARY_DIR / 'edible'))} images")
print(f"  - Poisonous mushrooms: {len(os.listdir(BINARY_DIR / 'poisonous'))} images")
print(f"\nTesting set directory: {TESTING_DIR}")
print(f"  - Edible mushrooms: {len(os.listdir(TESTING_DIR / 'edible'))} images")
print(f"  - Poisonous mushrooms: {len(os.listdir(TESTING_DIR / 'poisonous'))} images")
print(f"\nDataset path (use this in model code): {BINARY_DIR}")

Loading dataset...
Error checking file 092_43B354vYxm8.jpg: {{function_node __wrapped__DecodeImage_device_/job:localhost/replica:0/task:0/device:CPU:0}} jpeg::Uncompress failed. Invalid JPEG data or crop window. [Op:DecodeImage] name: 
Skipping corrupted file: C:\Users\86183\Desktop\final\uva-machine-learning-25f-projects\team-30\data\original_data\image-mushroom-dataset\versions\1\image-mushroom-dataset\seg_test\poisonous mushroom sporocarp\092_43B354vYxm8.jpg
Copying files to binary classification directory...
Copying files to testing directory...

Data processing completed!
Binary classification directory: C:\Users\86183\Desktop\final\uva-machine-learning-25f-projects\team-30\data\cleaned\mushroom_binary
  - Edible mushrooms: 1888 images
  - Poisonous mushrooms: 3572 images

Testing set directory: C:\Users\86183\Desktop\final\uva-machine-learning-25f-projects\team-30\data\cleaned\mushroom_testing
  - Edible mushrooms: 472 images
  - Poisonous mushrooms: 894 images

Dataset path (use