In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import shutil
from scipy.ndimage import map_coordinates, gaussian_filter

In [None]:
def is_roi_image(image_path, black_ratio_threshold=0.8):
    """Heuristically decide whether an image file is a (mostly black) ROI mask.

    The image is loaded as grayscale and binarized with a fixed threshold of
    127 (``cv2.THRESH_BINARY`` with a max value of 255). The image is treated
    as an ROI mask when the fraction of black pixels in the binarized image is
    at least ``black_ratio_threshold``.

    Args:
        image_path: Path of the image file to inspect.
        black_ratio_threshold: Minimum fraction of black pixels (after
            binarization) for the image to count as an ROI mask.

    Returns:
        bool: True if the image looks like an ROI mask, False otherwise.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports a missing/undecodable file by returning None
        # rather than raising; fail here with a message that names the path
        # instead of letting cv2.threshold fail on a None input.
        raise ValueError(f"could not read image: {image_path}")

    _, binary = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)

    black_pixels = int(np.count_nonzero(binary == 0))
    if black_pixels == 0:
        # No black pixel at all (or an empty array): never a mask, regardless
        # of the threshold. Also protects the division below.
        return False

    black_ratio = black_pixels / binary.size
    return bool(black_ratio >= black_ratio_threshold)

In [None]:
def extract_cropped_images(csv_path, base_folder, output_folder):
    """Copy one cropped image per CSV row into a single flat output folder.

    For every row with a non-null 'cropped image file path', the
    second-to-last '/'-separated component of that path is used as the name
    of a folder below ``base_folder``. Inside that folder the first image
    (in sorted order) whose name starts with "1-" is selected; if
    ``is_roi_image`` flags it as an ROI mask, the first image starting with
    "2-" is selected instead. The selected file is copied to
    ``output_folder`` as ``{patient_id}_{folder}_{CALC_TYPE}_{PATHOLOGY}.png``.

    Args:
        csv_path: CSV file with the columns 'cropped image file path',
            'patient_id', 'calc type' and 'pathology'.
        base_folder: Directory that contains one sub-folder per image UID.
        output_folder: Destination directory; created if it does not exist.

    Returns:
        tuple: ``(copied_count, malformed_paths)`` where ``malformed_paths``
        is a list of strings of the form ``"<csv path> (<reason>)"`` for the
        rows that could not be processed.
    """
    os.makedirs(output_folder, exist_ok=True)

    df = pd.read_csv(csv_path)
    df = df[df['cropped image file path'].notna()]

    malformed_paths = []
    valid_copied = []

    for _, row in df.iterrows():
        # Bound before the try block so the except handler below always
        # reports the path of the row that actually failed.
        rel_path = str(row['cropped image file path']).strip()
        try:
            parts = rel_path.split('/')
            if len(parts) < 3:
                malformed_paths.append(rel_path + " (format error)")
                continue

            # Get UID folder
            target_folder = parts[-2]
            folder_path = os.path.join(base_folder, target_folder)

            if not os.path.exists(folder_path):
                malformed_paths.append(rel_path + " (folder not found)")
                continue

            # os.listdir order is arbitrary; sort so the pick is deterministic.
            all_images = sorted(
                f for f in os.listdir(folder_path) if f.endswith(('.jpg', '.png'))
            )

            crop_image = None
            # A usable folder must hold at least two images.
            if len(all_images) >= 2:
                crop_image = next((f for f in all_images if f.startswith("1-")), None)
                # Only inspect the candidate when there is one: joining a None
                # file name would raise and hide the real reason for the failure.
                if crop_image is not None and is_roi_image(os.path.join(folder_path, crop_image)):
                    # The "1-" file is the ROI mask, so take the "2-" file.
                    crop_image = next((f for f in all_images if f.startswith("2-")), None)

            if crop_image is None:
                malformed_paths.append(rel_path + " (no valid crop image)")
                continue

            # Get the new filename
            patient_id = str(row["patient_id"]).strip()
            calc_type = str(row["calc type"]).strip().replace(" ", "_").replace("-", "_").upper()
            pathology = str(row["pathology"]).strip().replace(" ", "_").replace("-", "_").upper()
            # NOTE(review): the file is copied byte-for-byte, so a ".jpg"
            # source still ends up under a ".png" name.
            filename = f"{patient_id}_{target_folder}_{calc_type}_{pathology}.png"

            # Copy the image to the new location
            src = os.path.join(folder_path, crop_image)
            dst = os.path.join(output_folder, filename)

            shutil.copy(src, dst)
            valid_copied.append(dst)

        except Exception as e:
            # Best effort: one bad row must not abort the whole extraction.
            malformed_paths.append(f"{rel_path} (error: {e})")

    return len(valid_copied), malformed_paths

In [None]:
# Both splits read from the same raw image root and are written into one
# shared output folder.
split_csvs = {
    "train": "calc_case_description_train_set.csv",
    "test": "calc_case_description_test_set.csv",
}

# Keep every split's result: the returned list of problem rows is the only
# place where skipped or failed rows are reported.
extraction_results = {}
for split_name, split_csv in split_csvs.items():
    copied_count, malformed_paths = extract_cropped_images(
        csv_path=split_csv,
        base_folder="raw/image",
        output_folder="raw/cropped_images_all",
    )
    extraction_results[split_name] = (copied_count, malformed_paths)

    print(f"{split_name}: copied {copied_count} images, "
          f"{len(malformed_paths)} rows skipped")
    # Show only a handful of problem rows to keep the cell output readable;
    # the full list stays available in extraction_results.
    for problem in malformed_paths[:5]:
        print(f"  - {problem}")