In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import glob

In [2]:
# Root folder that holds one subfolder of images per class
data_path = (
    "/kaggle/input/lung-and-colon-cancer-histopathological-images/lung_colon_image_set/lung_image_sets"
)
# Names of the class subfolders found under data_path
classes = [
    'lung_aca',
    'lung_n',
    'lung_scc',
]

# Root folder for the processed images; the processing loop creates one subfolder per class in it.
# NOTE: the "prerocessing" spelling is kept as-is so that already-written output paths stay valid.
output_base_folder = (
    "/kaggle/working/data_after_prerocessing"
)

In [None]:
# Target size handed to cv2.resize; switch to (256, 256) if the downstream model expects that
RESIZE_DIM = (224, 224)

# Pre-computed per-class channel statistics (three values per class).
# NOTE(review): reinhard_color_normalization applies these to the L, a and b channels —
# confirm they were measured in OpenCV's LAB representation and not on RGB pixels.
lung_aca_mean = [159.11632787, 133.45948309, 230.42239682]
lung_aca_std = [40.84, 51.88, 21.86]

lung_n_mean = [201.32406849, 158.39889114, 194.89485492]
lung_n_std = [32.79, 51.90, 26.28]

lung_scc_mean = [151.9113276, 116.753016, 225.89104331]
lung_scc_std = [36.33, 38.97, 20.08]

# Normalization target: unweighted per-channel average of the class statistics,
# stored as plain Python lists
target_mean = np.array([lung_aca_mean, lung_n_mean, lung_scc_mean]).mean(axis=0).tolist()
target_std = np.array([lung_aca_std, lung_n_std, lung_scc_std]).mean(axis=0).tolist()

# Function for Reinhard color normalization
def reinhard_color_normalization(patch, target_mean, target_std):
    """Match the per-channel LAB statistics of a patch to target statistics.

    The patch is converted with cv2.COLOR_RGB2LAB, every channel is standardised
    with its own mean/std and rescaled to the target mean/std, and the result is
    converted back with cv2.COLOR_LAB2RGB.

    Args:
        patch: RGB image accepted by cv2.cvtColor. The processing loop passes the
            uint8 output of cv2.resize.
        target_mean: Sequence of three target means, indexed as L, a, b.
        target_std: Sequence of three target standard deviations, indexed as L, a, b.

    Returns:
        The color-normalized RGB image as a uint8 array.
    """
    # Convert the patch to LAB color space and process channel by channel
    patch_lab = cv2.cvtColor(patch, cv2.COLOR_RGB2LAB)

    matched_channels = []
    for idx, channel in enumerate(cv2.split(patch_lab)):
        channel_mean, channel_std = np.mean(channel), np.std(channel)
        if channel_std > 0:
            matched = (channel - channel_mean) / channel_std * target_std[idx] + target_mean[idx]
        else:
            # A constant channel has no spread to rescale; dividing by its zero std
            # would yield NaN, so place it directly at the target mean.
            matched = np.full(channel.shape, target_mean[idx], dtype=np.float64)
        matched_channels.append(matched)

    # Saturate to the 8-bit range before casting: a bare astype(np.uint8) wraps
    # out-of-range values around instead of clamping them.
    normalized_patch_lab = np.clip(cv2.merge(matched_channels), 0, 255).astype(np.uint8)

    # Convert back to RGB
    normalized_patch = cv2.cvtColor(normalized_patch_lab, cv2.COLOR_LAB2RGB)

    return normalized_patch

# Function for Min-Max normalization
def min_max_normalization(image):
    """Linearly rescale an image so its smallest value is 0 and its largest is 1.

    The minimum and maximum are taken over the whole array (all channels together).

    Args:
        image: Array-like of numeric pixel values.

    Returns:
        A float32 array of the same shape. A constant image (max == min) is
        returned as all zeros, and an empty input is returned as an empty
        float32 array, instead of producing NaN or raising.
    """
    image_float = np.asarray(image, dtype=np.float32)
    if image_float.size == 0:
        # np.min/np.max raise on empty input; there is nothing to rescale
        return image_float

    min_val = np.min(image_float)
    value_range = np.max(image_float) - min_val
    if value_range == 0:
        # Every pixel is identical: avoid 0/0 and map the whole image to 0
        return np.zeros_like(image_float)

    normalized_image = (image_float - min_val) / value_range
    return normalized_image

# Function to apply Watershed segmentation after Min-Max normalization
def watershed_segmentation(patch):
    """Run marker-based watershed on a patch and draw the boundaries in red.

    Args:
        patch: RGB image with values in [0, 1], as produced by
            min_max_normalization in the processing loop.

    Returns:
        A uint8 RGB image (values in [0, 255]) in which the pixels that
        cv2.watershed labels as boundaries (-1) are set to red.
    """
    # Back to 8-bit; clip first so slightly out-of-range values saturate instead of wrapping
    patch_uint8 = np.clip(patch * 255, 0, 255).astype(np.uint8)

    # Otsu threshold on the grayscale image; inverted, so darker pixels become foreground (255)
    gray = cv2.cvtColor(patch_uint8, cv2.COLOR_RGB2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Dilating the foreground mask: whatever stays outside it is treated as sure background
    kernel = np.ones((3, 3), np.uint8)
    sure_bg = cv2.dilate(thresh, kernel, iterations=2)

    # Sure foreground: pixels whose distance to the background exceeds 70% of the maximum
    dist_transform = cv2.distanceTransform(thresh, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, 0.7 * dist_transform.max(), 255, cv2.THRESH_BINARY)
    sure_fg = np.uint8(sure_fg)

    # Unknown region: inside the dilated mask but not sure foreground
    unknown = cv2.subtract(sure_bg, sure_fg)

    # Label the sure-foreground blobs, shift labels by one so the background is 1
    # rather than 0, then reserve 0 for the unknown region the watershed must resolve
    _, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1
    markers[unknown == 255] = 0

    # Apply the watershed algorithm (markers are updated; boundaries become -1)
    patch_bgr = cv2.cvtColor(patch_uint8, cv2.COLOR_RGB2BGR)
    markers = cv2.watershed(patch_bgr, markers)

    # Draw the boundaries directly on the RGB image. Writing [255, 0, 0] into the
    # BGR copy and converting back would produce blue, not red.
    patch_segmented = patch_uint8
    patch_segmented[markers == -1] = [255, 0, 0]

    return patch_segmented

In [4]:
# Process each class in turn. Images are streamed one at a time through the whole
# pipeline so that memory use does not grow with the number of images in a class.
for cls in classes:
    # Input folder for the class
    input_folder = os.path.join(data_path, cls)
    # All files of this class; sorted so the output numbering is reproducible across runs
    image_paths = sorted(glob.glob(os.path.join(input_folder, "*.*")))

    # Output folder for this class
    output_folder = os.path.join(output_base_folder, cls)
    os.makedirs(output_folder, exist_ok=True)

    saved_count = 0
    for image_path in image_paths:
        image_bgr = cv2.imread(image_path)
        if image_bgr is None:
            print(f"Error: Could not load image {image_path}")
            continue

        # cv2.imread yields BGR; the pipeline functions work on RGB
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

        # Resize -> Reinhard color normalization -> min-max scaling -> watershed
        resized = cv2.resize(image_rgb, RESIZE_DIM, interpolation=cv2.INTER_LANCZOS4)
        color_normalized = reinhard_color_normalization(resized, target_mean, target_std)
        min_max_scaled = min_max_normalization(color_normalized)
        segmented = watershed_segmentation(min_max_scaled)

        # watershed_segmentation already returns uint8 in [0, 255]; multiplying by 255
        # again would wrap around in uint8 and corrupt the pixels. cv2.imwrite expects
        # BGR channel order, so convert back before saving.
        file_name = os.path.join(output_folder, f'segmented_image_{saved_count + 1}.png')
        if cv2.imwrite(file_name, cv2.cvtColor(segmented, cv2.COLOR_RGB2BGR)):
            saved_count += 1
        else:
            print(f"Error: Could not save image {file_name}")

    print(f"Saved {saved_count} segmented images to '{output_folder}'")

Saved 5000 segmented images to '/kaggle/working/data_after_prerocessing/lung_aca'
Saved 5000 segmented images to '/kaggle/working/data_after_prerocessing/lung_n'
Saved 5000 segmented images to '/kaggle/working/data_after_prerocessing/lung_scc'
