In [0]:
# Install dependencies
# NOTE(review): tensorflow is installed here but never imported in the visible cells, while
# azure-storage-file-datalake and Pillow are imported below without being installed here —
# presumably they are preinstalled on the cluster; confirm, and consider pinning versions.
%pip install tensorflow opencv-python tqdm

In [0]:
# Import necessary modules

from PIL import Image
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
import random
import cv2
from azure.storage.filedatalake import DataLakeServiceClient

In [0]:
# Storage account, container and split-ratio settings used by the cells below.

# Storage account name; interpolated into the "<account>.dfs.core.windows.net" endpoint.
account_name = "rougestorageacc1"
# Account key read from a secret scope at runtime, so it is never hard-coded in the notebook.
account_key = dbutils.secrets.get(scope="rouge4kv1", key="rougestorageacc1key1")
# Container (file system) whose top-level directories are treated as class folders.
source_container = "stagekaggledata"
# Container (file system) that receives the train/<class>/ and validation/<class>/ copies.
dest_container = "processkaggledata"
# Fraction of each class folder's files assigned to training; the remainder goes to validation.
train_ratio = 0.8

In [0]:
# Build a Data Lake service client for the account's dfs endpoint, authenticated with the account key
service_client = DataLakeServiceClient(
    credential=account_key,
    account_url=f"https://{account_name}.dfs.core.windows.net",
)

# One file-system client per container: the source that is read and the destination that is written
source_fs, dest_fs = (
    service_client.get_file_system_client(file_system=container_name)
    for container_name in (source_container, dest_container)
)

In [0]:
# Split the data for training and validation into a new destination container and log the number of images in the resulting classes.

# Fixed seed + a private RNG: re-running this cell reproduces the same assignment, so a file can
# never end up in train/ on one run and validation/ on the next (which would leak data between splits).
split_seed = 42
split_rng = random.Random(split_seed)


def _copy_file(src_path, dest_path):
    """Copy one file from the source container to ``dest_path`` in the destination container.

    The source file is read fully into memory and written with create/append/flush.
    """
    data = source_fs.get_file_client(src_path).download_file().readall()
    dest_client = dest_fs.get_file_client(dest_path)

    dest_client.create_file()
    if data:
        dest_client.append_data(data, 0)
        # Flush exactly the number of bytes appended; this also avoids a second
        # round trip to the service just to look up the source file's size.
        dest_client.flush_data(len(data))
    # else: nothing to append — create_file() already left an empty file in place.


def _copy_split(img_paths, split_name, class_name):
    """Copy every path in ``img_paths`` to ``<split_name>/<class_name>/<file name>``."""
    for img_path in img_paths:
        _copy_file(img_path, f"{split_name}/{class_name}/{os.path.basename(img_path)}")


# SUMMARY LOG
split_summary = {}

# Process each class folder (every top-level directory of the source container is one class)
class_folders = [p.name for p in source_fs.get_paths(path="", recursive=False) if p.is_directory]

for class_folder in class_folders:
    print(f"\n🔍 Processing class: {class_folder}")

    # Sort first: a seeded shuffle is only reproducible when it starts from a stable order.
    images = sorted(
        f.name
        for f in source_fs.get_paths(path=class_folder, recursive=False)
        if not f.is_directory
    )
    total = len(images)

    if total == 0:
        print(f"⚠️  Skipping {class_folder}: No images found.")
        continue

    split_rng.shuffle(images)
    # int() truncates, so very small classes may contribute no training images at all.
    split_idx = int(total * train_ratio)
    train_imgs = images[:split_idx]
    val_imgs = images[split_idx:]

    split_summary[class_folder] = {
        "total": total,
        "train": len(train_imgs),
        "validation": len(val_imgs)
    }

    _copy_split(train_imgs, "train", class_folder)
    _copy_split(val_imgs, "validation", class_folder)

    print(f"✅ Split complete for '{class_folder}': {len(train_imgs)} train, {len(val_imgs)} validation")

# Summary Report
print("\n📊 Split Summary:")
for cls, stats in split_summary.items():
    print(f" - {cls}: Total={stats['total']} | Train={stats['train']} | Validation={stats['validation']}")

In [0]:
# Verify that the split dataset is reachable from Spark: register the account key for the
# storage endpoint, then list the contents of the train/ folder in the destination container.
storage_host = f"{account_name}.dfs.core.windows.net"
spark.conf.set(f"fs.azure.account.key.{storage_host}", f"{account_key}")

train_listing = dbutils.fs.ls(f"abfss://{dest_container}@{storage_host}/train")
display(train_listing)

In [0]:
# Components of the "/Volumes/<catalog>/<schema>/<volume_name>" path that the cropping and
# probing cells below read from and write to.
catalog = "main"
schema = "ml_data"
# NOTE(review): same name as dest_container — presumably this volume is backed by that container; confirm.
volume_name = "processkaggledata"

In [0]:
# Config for the cropping step
# Dataset sub-folders under root_path to crop; results are written to "<name>_cropped" next to each.
to_crop = ["train", "validation"]
# Root folder of the dataset inside the volume.
root_path = f"/Volumes/{catalog}/{schema}/{volume_name}"
# Images with height or width below s are skipped by the cropping loop.
s = 160     # side-length of crop (tunable)
stride = 32 # sliding window stride (can also tune)


In [0]:
# Define image processing functions

# Get HSV feature map
def get_hsv_feature_map(img):
    """Return the HSV saturation channel of a BGR image as a 2-D float32 array."""
    saturation = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)[..., 1]  # index 1 = S of (H, S, V)
    return saturation.astype(np.float32)

# Get the best crop of an image, img, given its feature map, window size (s x s) and stride.
def get_best_crop(img, feature_map, s, stride):
    """Return the s x s crop of ``img`` whose window has the largest feature-map sum.

    A window of side ``s`` is slid over ``feature_map`` in steps of ``stride``
    (rows first, then columns) and the values inside each window are summed.
    The first window reaching the highest sum wins, and the matching region
    is sliced out of ``img``.

    Args:
        img: Array to crop; indexed as ``img[y:y + s, x:x + s]``, so it is
            expected to share its first two dimensions with ``feature_map``.
        feature_map: 2-D array of per-pixel scores.
        s: Side length of the square window, in pixels.
        stride: Step between successive window positions, in pixels.

    Returns:
        The selected slice of ``img``. If ``feature_map`` is smaller than ``s``
        in either dimension no window is evaluated and the slice anchored at
        the top-left corner is returned (it will be smaller than s x s).

    Note:
        Window positions are multiples of ``stride``; when ``h - s`` or
        ``w - s`` is not a multiple of ``stride`` the strip along the bottom or
        right edge is never the start of a window.
    """
    h, w = feature_map.shape
    # Start below every possible sum so that feature maps containing zero or
    # negative values still select their true best window (a -1 sentinel would
    # silently fall back to the top-left corner for them).
    max_score = float("-inf")
    best_crop_coords = (0, 0)

    for y in range(0, h - s + 1, stride):
        for x in range(0, w - s + 1, stride):
            score = np.sum(feature_map[y:y + s, x:x + s])

            # Strict comparison: on ties the earliest window is kept.
            if score > max_score:
                max_score = score
                best_crop_coords = (x, y)

    x, y = best_crop_coords
    return img[y:y + s, x:x + s]


In [0]:
# Scan and crop images from the dataset.
# For every dataset in to_crop, each class folder is mirrored into "<dataset>_cropped" and every
# readable image that is at least s x s pixels is replaced by its highest-saturation s x s crop.

image_extensions = ('.jpg', '.jpeg', '.png')

for dataset in to_crop:
    source_root = f"{root_path}/{dataset}"
    target_root = f"{root_path}/{dataset}_cropped"
    # Per-dataset tally so that skipped or failed images are visible instead of silently dropped.
    counts = {"written": 0, "unreadable": 0, "too_small": 0, "failed": 0}

    for class_name in tqdm(os.listdir(source_root), desc=dataset):
        class_path = os.path.join(source_root, class_name)
        if not os.path.isdir(class_path):
            continue

        # Create same class folder in output
        target_class_path = os.path.join(target_root, class_name)
        os.makedirs(target_class_path, exist_ok=True)

        for fname in os.listdir(class_path):
            if not fname.lower().endswith(image_extensions):
                continue

            src_path = os.path.join(class_path, fname)
            dst_path = os.path.join(target_class_path, fname)

            try:
                img = cv2.imread(src_path)
                # cv2.imread signals an unreadable file by returning None instead of raising.
                if img is None:
                    counts["unreadable"] += 1
                    continue

                # Skip if image smaller than crop size (checked before any feature-map work)
                if img.shape[0] < s or img.shape[1] < s:
                    counts["too_small"] += 1
                    continue

                feature_map = get_hsv_feature_map(img)
                crop = get_best_crop(img, feature_map, s=s, stride=stride)

                # Save cropped image; cv2.imwrite reports failure through its return value.
                if not cv2.imwrite(dst_path, crop):
                    raise OSError(f"cv2.imwrite could not write {dst_path}")
                counts["written"] += 1

            except Exception as e:
                counts["failed"] += 1
                print(f"Failed to process {src_path}: {e}")

    print(
        f"{dataset}: {counts['written']} cropped, {counts['too_small']} smaller than {s}px, "
        f"{counts['unreadable']} unreadable, {counts['failed']} failed"
    )

The following code is used to probe/verify the size of the images in the dataset.

In [0]:
# Get an overview of the image dimensions in the dataset

image_dir = f"/Volumes/{catalog}/{schema}/{volume_name}/train"
sizes = []

# Walk every sub-folder and record (width, height) of each image that can be opened.
for root, _, files in os.walk(image_dir):
    for file in files:
        if file.lower().endswith(('.jpg', '.jpeg', '.png')):
            path = os.path.join(root, file)
            try:
                with Image.open(path) as img:
                    sizes.append(img.size)  # (width, height)
            except Exception as e:
                print(f"Error reading {path}: {e}")

# Quick summary
if sizes:
    widths, heights = zip(*sizes)
    print(f"Total images: {len(sizes)}")
    # Width and height extremes are computed independently, so the reported min/max
    # "sizes" are per-dimension bounds and need not belong to a single image.
    print(f"Min size: {min(widths)}x{min(heights)}")
    print(f"Max size: {max(widths)}x{max(heights)}")
    print(f"Avg size: {sum(widths)//len(widths)}x{sum(heights)//len(heights)}")
else:
    # zip(*[]) cannot be unpacked into two names; report the empty result explicitly instead.
    print(f"Total images: 0 (no readable images found under {image_dir})")

In [0]:
# Extract a region of interest for a given input image. Returns the cropped image, the coordinates used to crop the image and the max score (max sum of the array values over the feature map) associated with it.

def find_feature_dense_crop(image_path, s=64, stride=32):
    """Find the s x s crop of an image with the highest total HSV saturation.

    Args:
        image_path: Path of the image file to load with OpenCV.
        s: Side length of the square sliding window, in pixels.
        stride: Step between successive window positions, in pixels.

    Returns:
        A tuple ``(cropped_img, best_crop_coords, max_score)`` where
        ``cropped_img`` is the selected BGR region, ``best_crop_coords`` is the
        ``(x, y)`` top-left corner of that region and ``max_score`` is the sum
        of the saturation values inside it. If the image is smaller than ``s``
        in either dimension no window is evaluated: the coordinates stay at
        ``(0, 0)``, the score stays at ``-1`` and the crop is truncated to the
        image size.

    Raises:
        ValueError: If the file cannot be read or decoded as an image.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None for missing or undecodable files; fail here with a
    # clear message instead of an opaque error from the colour conversion below.
    if img is None:
        raise ValueError(f"Could not read or decode image: {image_path}")

    # Per-pixel saturation (float32), shared with the batch cropping code.
    feature_map = get_hsv_feature_map(img)

    h, w = feature_map.shape
    # Saturation is non-negative, so -1 is below every real window score.
    max_score = -1
    best_crop_coords = (0, 0)

    # Slide a window and sum the saturation inside it; the first best window wins ties
    for y in range(0, h - s + 1, stride):
        for x in range(0, w - s + 1, stride):
            score = np.sum(feature_map[y:y+s, x:x+s])

            if score > max_score:
                max_score = score
                best_crop_coords = (x, y)

    # Extract crop from original image
    x, y = best_crop_coords
    cropped_img = img[y:y+s, x:x+s]
    return cropped_img, best_crop_coords, max_score


In [0]:
# Display the auto-cropped image to verify the results. 

import matplotlib.pyplot as plt

sample_img = f"/Volumes/{catalog}/{schema}/{volume_name}/train/brownspot/BROWNSPOT1_013.jpg"
crop, coords, score = find_feature_dense_crop(sample_img)
print(f"Crop top-left: {coords}, Feature density score: {score}")

# OpenCV arrays are BGR; convert to RGB so matplotlib shows the true colours
crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)

fig, ax = plt.subplots()
ax.imshow(crop_rgb)
ax.set_title("Auto-cropped high-density region")
ax.axis("off")
plt.show()