### Preprocessing (Resizing, Augmentation and Class Balancing)

In [1]:
import os

# Report how many directory entries each class folder of the raw dataset holds.
base_dir = "datasets/iqothnccd-lung-cancer-dataset/versions/2/The IQ-OTHNCCD lung cancer dataset/The IQ-OTHNCCD lung cancer dataset"
labels = ["Benign cases", "Malignant cases", "Normal cases"]
for class_name in labels:
    class_folder = os.path.join(base_dir, class_name)
    # Every entry is counted; nothing is filtered by file type.
    entry_count = len(os.listdir(class_folder))
    print(entry_count)

120
561
416


In [2]:
import shutil
import random 
from PIL import Image, ImageEnhance

# Augmentation settings
target_count = 600     # number of images each class should reach
img_size = (512, 512)  # size passed to PIL's resize() for augmented images

# Start from an empty output directory on every run.
output_dir = "./datasets/preprocessed_data/"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir)

def get_random_transform():
    """Pick one augmentation at random and return it as a callable.

    The returned callable takes a PIL image and returns the transformed
    image. Random parameters (rotation angle, enhancement factor) are drawn
    when the callable is invoked, not when it is selected.
    """
    def flip_horizontal(img):
        return img.transpose(Image.FLIP_LEFT_RIGHT)

    def flip_vertical(img):
        return img.transpose(Image.FLIP_TOP_BOTTOM)

    def rotate(img):
        # Angle in degrees, drawn uniformly from [-30, 30].
        return img.rotate(random.uniform(-30, 30))

    def boost_contrast(img):
        return ImageEnhance.Contrast(img).enhance(random.uniform(1.2, 1.8))

    def boost_sharpness(img):
        return ImageEnhance.Sharpness(img).enhance(random.uniform(1.2, 2.0))

    def boost_color(img):
        # NOTE(review): if the scans are grey (equal R, G, B channels), colour
        # enhancement presumably leaves them unchanged -- confirm it is useful.
        return ImageEnhance.Color(img).enhance(random.uniform(1.5, 2.5))

    candidates = [
        flip_horizontal,
        flip_vertical,
        rotate,
        boost_contrast,
        boost_sharpness,
        boost_color,
    ]
    return random.choice(candidates)

def augment_and_save(class_name, target_count):
    """Copy one class into the output directory and top it up with augmented images.

    All regular files in ``base_dir/class_name`` are copied unchanged into
    ``output_dir/class_name``. If the class holds fewer than ``target_count``
    files, randomly chosen source images are converted to RGB, resized to
    ``img_size``, passed through one random transform, and saved under an
    ``aug<i>_`` prefix until the class reaches ``target_count`` files.

    Args:
        class_name: Name of the class sub-folder (same name in source and output).
        target_count: Number of files the class should contain afterwards.

    Raises:
        FileExistsError: If the destination class folder already exists.
    """
    class_src = os.path.join(base_dir, class_name)
    class_dst = os.path.join(output_dir, class_name)
    # Deliberately no exist_ok: writing into a folder left over from an earlier
    # run would mix in stale augmented files and break the class balance.
    os.makedirs(class_dst)

    # Keep regular files only; a stray sub-directory would make shutil.copy
    # raise. Sorting makes random.choice reproducible once a seed is set.
    images = sorted(
        name for name in os.listdir(class_src)
        if os.path.isfile(os.path.join(class_src, name))
    )

    # NOTE(review): originals are copied as-is (not resized or converted), so
    # the output mixes original-size files with img_size augmentations --
    # confirm that downstream loading resizes everything.
    for name in images:
        shutil.copy(os.path.join(class_src, name), os.path.join(class_dst, name))

    extra_needed = target_count - len(images)
    if extra_needed <= 0:
        return
    if not images:
        # random.choice([]) would raise IndexError; there is nothing to augment.
        print(f"No source images found for {class_name}; nothing to augment")
        return

    print(f"Need to generate {extra_needed} more images for {class_name}")

    generated = 0
    attempts = 0
    # Failed augmentations are retried so the class really reaches
    # target_count; the cap keeps a folder of unreadable files from looping forever.
    max_attempts = extra_needed * 3
    while generated < extra_needed and attempts < max_attempts:
        attempts += 1
        img_name = random.choice(images)
        save_path = os.path.join(class_dst, f"aug{generated}_{img_name}")
        try:
            with Image.open(os.path.join(class_src, img_name)) as source_img:
                resized = source_img.convert("RGB").resize(img_size)
                transformed_img = get_random_transform()(resized)
                transformed_img.save(save_path)
        except Exception as e:
            # Best effort: report, drop any partially written file, try another image.
            print(f"Error while augmenting: {e}")
            if os.path.exists(save_path):
                os.remove(save_path)
            continue
        generated += 1

    if generated < extra_needed:
        print(
            f"Warning: only generated {generated} of {extra_needed} "
            f"extra images for {class_name}"
        )
# Bring every class up to the shared target size, then report completion.
for class_name in labels:
    augment_and_save(class_name, target_count=target_count)
print("Augmentation complete for all classes!.")

Need to generate 480 more images for Benign cases
Need to generate 39 more images for Malignant cases
Need to generate 184 more images for Normal cases
Augmentation complete for all classes!.


The brain tumor and breast cancer datasets are already preprocessed. The PROSTATE_MRI dataset needs full preprocessing.

# Breast Cancer

In [ ]:
breast_cancer_dir = "./datasets/breast-cancer-detection/versions/1/"
# TODO: move the already-preprocessed data to ./datasets/preprocessed
# NOTE(review): the lung-cancer cell writes to ./datasets/preprocessed_data/ --
# confirm which target directory is intended.
