In [1]:
import os
import random
import cv2
import numpy as np
from natsort import natsorted
from tqdm import tqdm

## Data creation

Helper function definitions

In [2]:
def fill_large_contours(image, binary_mask, area_threshold):
    """Build a mask containing only the filled largest external contour of ``binary_mask``.

    Args:
        image: Not used by this function; kept so existing call sites stay valid.
        binary_mask: Mask handed to ``cv2.findContours``; non-zero pixels are foreground.
        area_threshold: The largest contour's area must be strictly greater than this.

    Returns:
        * ``binary_mask`` itself when no contours are found.
        * A new array shaped like ``binary_mask`` with the largest contour filled with 255
          when its area exceeds ``area_threshold``.
        * ``None`` when the largest contour is not larger than ``area_threshold``.
    """
    found, _hierarchy = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Nothing detected: hand the input mask back untouched
    if len(found) == 0:
        return binary_mask

    biggest = max(found, key=cv2.contourArea)

    # Too small to be of interest: signal the caller with None
    if cv2.contourArea(biggest) <= area_threshold:
        return None

    # Paint only the biggest contour, solid, onto an empty canvas
    filled = np.zeros_like(binary_mask)
    cv2.drawContours(filled, [biggest], -1, 255, thickness=cv2.FILLED)
    return filled


In [3]:
def center_and_scale(img, mask, desired_size=None):
    """Crop ``img`` to the mask's contours, rescale the crop and centre it on a black canvas.

    Args:
        img: Image array, 2-D (grayscale) or 3-D (height, width, channels).
        mask: Mask handed to ``cv2.findContours``; the union of the bounding boxes of its
            external contours defines the crop region.
        desired_size: ``(height, width)`` of the output canvas. When ``None`` the height and
            width of ``img`` are used.

    Returns:
        A zero-filled array of the desired size (with the channel axis of ``img`` for colour
        input) and the dtype of ``img``, holding the crop scaled to fit with its aspect
        ratio preserved and centred. If the mask has no contours, ``img`` is returned
        unchanged.
    """
    # Find contours in the mask
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return img  # Return original if no contours found

    # Calculate the combined bounding box around all contours
    min_x, min_y = img.shape[1], img.shape[0]
    max_x = max_y = 0
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        min_x, min_y = min(x, min_x), min(y, min_y)
        max_x, max_y = max(x + w, max_x), max(y + h, max_y)

    # Crop the original image around the combined bounding box.
    # The column slice must run to max_x; slicing min_x:min_x yields a zero-width crop.
    cropped_img = img[min_y:max_y, min_x:max_x]

    # Use original image size if desired_size is not specified
    if desired_size is None:
        desired_size = img.shape[:2]

    # Calculate the scaling factor while maintaining the aspect ratio
    h, w = cropped_img.shape[:2]
    aspect_ratio = w / h
    desired_h, desired_w = desired_size
    if desired_w / desired_h > aspect_ratio:
        # Canvas is relatively wider than the crop: height is the limiting dimension
        new_h = desired_h
        new_w = int(aspect_ratio * new_h)
    else:
        # Canvas is relatively taller (or equal): width is the limiting dimension
        new_w = desired_w
        new_h = int(new_w / aspect_ratio)

    # Very thin crops can truncate to 0 px, which cv2.resize rejects; keep at least 1 px
    new_w = max(1, new_w)
    new_h = max(1, new_h)

    # Scale the cropped image to the new size (cv2.resize takes (width, height))
    scaled_img = cv2.resize(cropped_img, (new_w, new_h), interpolation=cv2.INTER_AREA)

    # Create a new blank image with the desired size
    if len(img.shape) == 2:  # Grayscale image
        final_img = np.zeros((desired_h, desired_w), dtype=img.dtype)
    else:  # Color image
        final_img = np.zeros((desired_h, desired_w, img.shape[2]), dtype=img.dtype)

    # Centre the scaled image on the new blank image
    x_offset = (desired_w - new_w) // 2
    y_offset = (desired_h - new_h) // 2
    final_img[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = scaled_img

    return final_img


In [5]:
# Input folder to inspect
folder_path = './data/sub_files/A1_50kV'
file_names = natsorted(os.listdir(folder_path))  # Naturally sorted listing of the whole folder; NOTE(review): no slice is applied here, so no images are cut off in this cell
# Report how many entries the folder contains
print(len(file_names))

2482


## CT Scans

In [10]:
# Define the folder paths
folder_path = './data/sub_files/C1_50kV'
output_path = './data/preproc/C1_50kV'

# List all files in natural order and select the required sample
file_names = natsorted(os.listdir(folder_path))[150:-150]  # Cut off first and last 150 images

# Create the output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Initialize counters for skipped images
skipped_images = 0  # thresholded foreground smaller than 9000 pixels
contour_skip = 0    # largest contour not above the area threshold (either pass)

# 5x5 structuring element shared by dilation, opening and closing; loop-invariant, so built once
kernel = np.ones((5, 5), np.uint8)

# Iterate through all selected files
for filename in tqdm(file_names, desc='Processing images'):
    file_path = os.path.join(folder_path, filename)
    processed_file_path = os.path.join(output_path, filename)

    # Ignore directory entries that are not regular files
    if not os.path.isfile(file_path):
        continue

    # Load the image
    image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Unable to load image: {filename}")
        continue

    # Apply a global threshold to isolate the circular area
    _, thresholded = cv2.threshold(image, 42, 255, cv2.THRESH_BINARY)   # set this threshold depending on the input data (values used: 50, 22, 42)

    # Skip this image if the thresholded area is less than 9000
    if cv2.countNonZero(thresholded) < 9000:
        skipped_images += 1
        continue

    # Use dilation to enlarge the regions within the threshold
    dilated_image = cv2.dilate(thresholded, kernel, iterations=3)

    filled = fill_large_contours(image, dilated_image, 25000)
    if filled is None:
        contour_skip += 1
        continue

    # Apply morphological operations to clean the mask
    cleaned = cv2.morphologyEx(filled, cv2.MORPH_OPEN, kernel, iterations=2)
    closing = cv2.morphologyEx(cleaned, cv2.MORPH_CLOSE, kernel, iterations=3)

    # Apply Gaussian blurring to the cleaned image
    blurred = cv2.GaussianBlur(closing, (15, 15), 0)
    blurred_filled = fill_large_contours(image, blurred, 20000)
    # fill_large_contours returns None when the largest contour is too small; passing
    # None on as a mask would crash in center_and_scale, so skip and count the image
    if blurred_filled is None:
        contour_skip += 1
        continue

    # Mask the original image with the processed mask
    result_img = cv2.bitwise_and(image, image, mask=blurred_filled)
    result_img = center_and_scale(result_img, blurred_filled, (512, 512))

    # Save the processed image; cv2.imwrite signals failure via its return value
    if not cv2.imwrite(processed_file_path, result_img):
        print(f"Unable to save image: {processed_file_path}")

# Print summary of skipped images
print(f"Skipped {skipped_images} images due to small thresholded area.")
print(f"Skipped {contour_skip} images due to insufficient contour area.")

Processing images:   0%|          | 0/1201 [00:00<?, ?it/s]

Processing images:  79%|███████▉  | 954/1201 [02:24<00:28,  8.63it/s]

Unable to load image: C1_50kV_1103.png


Processing images: 100%|██████████| 1201/1201 [03:00<00:00,  6.64it/s]

Skipped 0 images due to small thresholded area.
Contour 0 images skipped.





## Histology scans

In [4]:
def process_images(input_folder, output_folder, prefix='', desired_size=(512, 512)):
    """Fit every image of ``input_folder`` onto a black canvas and save it to ``output_folder``.

    Images whose top-left pixel is pure white (255 in every channel) are skipped. Images
    larger than the canvas are shrunk with their aspect ratio preserved; smaller images are
    not enlarged. The image is then centred with black padding and written under the name
    ``prefix + original file name``.

    Args:
        input_folder: Directory to read images from.
        output_folder: Directory to write results to; created if it does not exist.
        prefix: String prepended to each output file name.
        desired_size: ``(height, width)`` of the output canvas.

    Returns:
        None. Progress, unreadable or unwritable files, and the number of skipped images are
        reported on standard output.
    """
    # Create the output directory if it does not exist
    os.makedirs(output_folder, exist_ok=True)

    desired_h, desired_w = desired_size[0], desired_size[1]

    # Counter for skipped images
    skipped_count = 0

    # Get a list of all image files in the input folder.
    # The extension is compared case-insensitively so files such as "SCAN.PNG" are not missed.
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')
    image_files = [f for f in os.listdir(input_folder) if f.lower().endswith(image_extensions)]

    for image_file in tqdm(image_files, desc="Processing images"):
        image_path = os.path.join(input_folder, image_file)
        img = cv2.imread(image_path)

        if img is None:
            print(f"Failed to read {image_path}")
            continue

        # Check if the background is white (top-left corner)
        if np.all(img[0, 0] == [255, 255, 255]):
            skipped_count += 1
            continue

        h, w = img.shape[:2]

        # Resize the image if it's larger than the desired size
        if h > desired_h or w > desired_w:
            scale = min(desired_h / h, desired_w / w)
            # Extreme aspect ratios can truncate to 0 px, which cv2.resize rejects
            new_h = max(1, int(h * scale))
            new_w = max(1, int(w * scale))
            img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)
        else:
            new_h, new_w = h, w

        # Calculate padding to center the image (any odd pixel goes to the bottom/right)
        top_pad = (desired_h - new_h) // 2
        bottom_pad = desired_h - new_h - top_pad
        left_pad = (desired_w - new_w) // 2
        right_pad = desired_w - new_w - left_pad

        # Apply padding to center the image
        padded_img = cv2.copyMakeBorder(img, top_pad, bottom_pad, left_pad, right_pad, cv2.BORDER_CONSTANT, value=[0, 0, 0])

        # Save the processed image with the specified prefix
        output_path = os.path.join(output_folder, f"{prefix}{image_file}")
        if not cv2.imwrite(output_path, padded_img):
            print(f"Failed to write {output_path}")

    print(f"Skipped {skipped_count} images with white backgrounds.")

# Source directory, destination directory and the tag prepended to every saved file name
input_folder = './data/hist/ts19_EMA65'
output_folder = './data/preproc/TS19'
prefix = 'TS19_'

# Run the preprocessing over the whole input folder
process_images(
    input_folder=input_folder,
    output_folder=output_folder,
    prefix=prefix,
    desired_size=(512, 512),
)

Processing images: 100%|██████████| 656/656 [00:36<00:00, 18.08it/s]

Skipped 41 images with white backgrounds.



