In [None]:
# ===============================================================
# ADVANCED MULTI-CORE FUNDUS IMAGE PREPROCESSING PIPELINE
# FOR DIABETIC RETINOPATHY DETECTION (MAX QUALITY + SPEED)
# NOTE: This cell is adapted to run reliably inside Jupyter/VSCode notebooks
# ===============================================================

import cv2
import os
import numpy as np
from tqdm import tqdm
from multiprocessing import cpu_count
from functools import partial
from concurrent.futures import ThreadPoolExecutor, as_completed

# ===============================================================
# 1. ADVANCED PREPROCESSING FUNCTIONS (Robust, notebook-friendly)
# ===============================================================

def crop_black(img, thresh=15):
    """Trim the dark border surrounding the brightest region of a BGR image.

    The image is converted to grayscale and binarised: pixels brighter than
    ``thresh`` count as foreground. The crop is the bounding rectangle of
    the external foreground contour with the largest area.

    Args:
        img: BGR image array, or ``None``.
        thresh: Grayscale cut-off separating background from foreground.

    Returns:
        The cropped view of ``img``. ``img`` itself is returned unchanged
        when it is ``None``, when no contour is found, or when the bounding
        rectangle has zero width or height.
    """
    if img is None:
        return img

    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, foreground = cv2.threshold(grayscale, thresh, 255, cv2.THRESH_BINARY)
    regions, _hierarchy = cv2.findContours(
        foreground, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    if regions:
        # The largest-area contour is assumed to be the retina.
        largest = max(regions, key=cv2.contourArea)
        left, top, width, height = cv2.boundingRect(largest)
        if width != 0 and height != 0:
            return img[top:top + height, left:left + width]

    # Nothing usable was detected: hand back the input untouched.
    return img


def circular_mask(img, padding_factor=0.98, thresh=10):
    """Black out everything outside a circle fitted to the largest bright region.

    The image is binarised at ``thresh`` and the external contour with the
    largest area is selected. The circle is centred on that contour's
    centroid (from image moments) and its radius is the contour's minimum
    enclosing circle radius scaled by ``padding_factor``, capped at half of
    the shorter image side. When no contour is found, the circle is centred
    on the image with that same half-side radius.

    Args:
        img: BGR image array, or ``None``.
        padding_factor: Multiplier applied to the enclosing-circle radius.
        thresh: Grayscale cut-off used to separate foreground from
            background (previously hard-coded to 10).

    Returns:
        A copy of ``img`` with pixels outside the circle set to zero, or
        ``None`` when ``img`` is ``None`` (mirrors ``crop_black``).
    """
    if img is None:
        return img

    h, w = img.shape[:2]

    # Fallback geometry: image centre and the largest circle that fits.
    cx, cy = w // 2, h // 2
    max_radius = min(w, h) // 2
    radius = max_radius

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, bw = cv2.threshold(gray, thresh, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(bw, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if contours:
        c = max(contours, key=cv2.contourArea)

        # Centroid via moments; a zero-area contour keeps the image centre.
        M = cv2.moments(c)
        if M.get('m00', 0) != 0:
            cx = int(M['m10'] / M['m00'])
            cy = int(M['m01'] / M['m00'])

        # Only the radius of the enclosing circle is used, not its centre.
        (_, _), r = cv2.minEnclosingCircle(c)
        radius = int(min(r * padding_factor, max_radius))

    mask = np.zeros((h, w), np.uint8)
    cv2.circle(mask, (cx, cy), radius, 255, -1)
    return cv2.bitwise_and(img, img, mask=mask)


def clahe_enhance(img):
    """Boost local contrast by applying CLAHE to the lightness channel.

    The BGR image is converted to LAB, CLAHE (clip limit 2.0, 8x8 tile grid)
    is applied to the L channel only, and the result is converted back to
    BGR. The A and B channels are passed through unchanged.
    """
    lightness, chroma_a, chroma_b = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2LAB))
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    lab_equalized = cv2.merge((equalizer.apply(lightness), chroma_a, chroma_b))
    return cv2.cvtColor(lab_equalized, cv2.COLOR_LAB2BGR)


def gamma_correction(img, gamma=1.2):
    """Apply gamma correction to an image through a 256-entry lookup table.

    Each intensity ``i`` is mapped to ``(i / 255) ** (1 / gamma) * 255``,
    truncated to ``uint8``. With ``gamma > 1`` the exponent is below one, so
    mid-tones are brightened; with ``gamma < 1`` they are darkened.

    Args:
        img: Image array accepted by ``cv2.LUT``.
        gamma: Positive gamma value.

    Returns:
        The image with every channel value remapped through the table.

    Raises:
        ValueError: If ``gamma`` is not strictly positive (zero would divide
            by zero and negative values are undefined at intensity 0).
    """
    # ``not gamma > 0`` also rejects NaN, which ``gamma <= 0`` would let through.
    if not gamma > 0:
        raise ValueError(f"gamma must be a positive number, got {gamma!r}")

    inv_gamma = 1.0 / gamma
    # Vectorised table construction; astype truncates exactly like the
    # element-wise version did.
    table = ((np.arange(256) / 255.0) ** inv_gamma * 255).astype("uint8")
    return cv2.LUT(img, table)


def preprocess_fundus(image_path, output_folder, input_folder, target_size=512):
    """Run the full preprocessing chain on one image and write the result.

    Steps, in order: border crop, resize to ``target_size`` x ``target_size``,
    circular mask, CLAHE, gamma correction (1.2), then subtraction of a
    heavily blurred copy to emphasise local detail. The output is written
    under ``output_folder`` at the same path the input has relative to
    ``input_folder``.

    Returns:
        ``True`` when the processed image was written, ``False`` when the
        file could not be read, any step raised, or the write failed. A
        message is printed in each failure case.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"⚠️ Could not read image: {image_path}")
        return False

    try:
        # Geometry: drop the black border, then normalise to a square.
        cropped = crop_black(image)
        resized = cv2.resize(
            cropped, (target_size, target_size), interpolation=cv2.INTER_AREA
        )

        # Photometry: mask -> CLAHE -> gamma, applied inside-out.
        enhanced = gamma_correction(clahe_enhance(circular_mask(resized)), 1.2)

        # 4 * (image - blurred) + 128: removes slow illumination changes.
        background = cv2.GaussianBlur(enhanced, (0, 0), 40)
        sharpened = cv2.addWeighted(enhanced, 4, background, -4, 128)

        # Mirror the input's sub-folder layout under the output root.
        destination = os.path.join(
            output_folder, os.path.relpath(image_path, input_folder)
        )
        os.makedirs(os.path.dirname(destination), exist_ok=True)

        if cv2.imwrite(destination, sharpened):
            return True
        print(f"⚠️ Failed to write image: {destination}")
        return False

    except Exception as err:
        # Worker boundary: report and signal failure instead of propagating.
        print(f"⚠️ Error processing {image_path}: {err}")
        return False


# ===============================================================
# 2. PARALLEL (notebook-friendly) PROCESSING FUNCTION
# ===============================================================

def process_all_images(input_folder, output_folder, num_workers=None, use_processes=False):
    """Preprocess every image found under ``input_folder``.

    The folder is walked recursively and every ``.jpg``/``.jpeg``/``.png``
    file (case-insensitive) is passed to ``preprocess_fundus``. Work is
    fanned out over a thread pool by default, or over a process pool when
    ``use_processes`` is true.

    Args:
        input_folder: Root folder to scan for images.
        output_folder: Root folder for results; created if missing.
        num_workers: Pool size. Defaults to ``cpu_count() - 1`` (minimum 1).
        use_processes: Use ``multiprocessing.Pool`` instead of threads.
            Intended for standalone scripts rather than notebooks.

    Returns:
        The number of images that were processed successfully. Both
        execution modes print the same summary line before returning.
    """
    if num_workers is None:
        num_workers = max(1, cpu_count() - 1)

    # Gather all image paths
    image_paths = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.lower().endswith((".jpg", ".jpeg", ".png")):
                image_paths.append(os.path.join(root, file))
    total = len(image_paths)

    print(f"🧠 Found {total} images. Workers={num_workers}, use_processes={use_processes}")
    os.makedirs(output_folder, exist_ok=True)

    successes = 0
    if not image_paths:
        # Nothing to do: skip pool start-up entirely.
        pass
    elif use_processes:
        # Multiprocessing path (best used when running as script, not in notebook)
        from multiprocessing import Pool

        worker = partial(
            preprocess_fundus, output_folder=output_folder, input_folder=input_folder
        )
        with Pool(processes=num_workers) as pool:
            results = tqdm(
                pool.imap_unordered(worker, image_paths),
                total=total,
                desc="🚀 Preprocessing (processes)",
            )
            # Count successes here too, so both modes report the same way.
            successes = sum(1 for ok in results if ok)
    else:
        # Threaded path (safe inside notebooks / interactive shells)
        with ThreadPoolExecutor(max_workers=num_workers) as exe:
            futures = [
                exe.submit(preprocess_fundus, p, output_folder, input_folder)
                for p in image_paths
            ]
            for f in tqdm(as_completed(futures), total=total, desc="🚀 Preprocessing (threads)"):
                try:
                    if f.result():
                        successes += 1
                except Exception as e:
                    print(f"⚠️ Worker failed: {e}")

    print(f"✅ Successfully processed {successes}/{total} images and saved to: {output_folder}")
    return successes


# ===============================================================
# 3. RUN PIPELINE – for your Kaggle DR dataset
# NOTE: When running inside a notebook, call process_all_images(..., use_processes=False)
# If running as a script (python m1.py), you can call with use_processes=True for true multiprocessing.
# ===============================================================

if __name__ == "__main__":
    import sys

    # Notebook cells also execute with __name__ == "__main__". Functions
    # defined in a cell live only in the kernel, so spawned pool workers
    # cannot import them and the process-based path can stall. Use processes
    # only when no IPython kernel is loaded (i.e. running as a plain script).
    # NOTE(review): detection relies on "ipykernel" being imported by the
    # Jupyter/VSCode kernel — confirm for other front-ends.
    running_in_notebook = "ipykernel" in sys.modules

    dataset_root = "C:/Users/kondk/Downloads/archive (2)"

    # Same order as before: train, then validation, then test.
    for split in ("train", "val", "test"):
        process_all_images(
            f"{dataset_root}/split_dataset/{split}",
            f"{dataset_root}/split_dataset_processed/{split}",
            use_processes=not running_in_notebook,
        )

🧠 Found 29840 images. Using 23 CPU cores...


🚀 Preprocessing Images:   0%|          | 0/29840 [00:00<?, ?it/s]