In [1]:
import kagglehub
import os
import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
import PIL
print(PIL.__version__)  # Check if it’s installed correctly

# Get data from kaggle - use these two lines to get BreakHis data!

path = kagglehub.dataset_download("ambarish/breakhis")
print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm
2025-03-25 01:11:09.497333: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


11.1.0
Path to dataset files: /rds/general/user/js4124/home/.cache/kagglehub/datasets/ambarish/breakhis/versions/4


In [2]:
# data_dir = "../data/versions/4/BreaKHis_v1/BreaKHis_v1/histology_slides/breast"
data_dir = "../../.cache/kagglehub/datasets/ambarish/breakhis/versions/4"
metadata = []
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.endswith(".png"):
            # Extract label from the folder structure
            label = "malignant" if "malignant" in root else "benign"
            
            # Extract magnification 
            
            magnification = None
            for part in root.split(os.sep):
                if part.endswith("X") and part[:-1].isdigit(): 
                    # magnification = part # turn to int
                    magnification = int(part[:-1])
                    break
            
            # Extract tumor subtype 
            tumor_subtype = None
            for part in root.split(os.sep):
                if part in ["adenosis", "fibroadenoma", "phyllodes_tumor", "tubular_adenoma",  # Benign subtypes
                           "ductal_carcinoma", "lobular_carcinoma", "mucinous_carcinoma", "papillary_carcinoma"]:  # Malignant subtypes
                    tumor_subtype = part
                    break
            
            # Append filepath, label, magnification, and tumor subtype to metadata
            metadata.append((os.path.join(root, file), label, magnification, tumor_subtype))



In [3]:
# Convert to DataFrame
df = pd.DataFrame(metadata, columns=["filepath", "label", "magnification", "tumor_subtype"])

df.to_csv("../data/metadata.csv")

# Debugging: Check the shape and first few rows of the DataFrame
print(f"DataFrame shape: {df.shape}")
print(df.head())

DataFrame shape: (7909, 4)
                                            filepath   label  magnification  \
0  /rds/general/user/js4124/home/.cache/kagglehub...  benign            100   
1  /rds/general/user/js4124/home/.cache/kagglehub...  benign            100   
2  /rds/general/user/js4124/home/.cache/kagglehub...  benign            100   
3  /rds/general/user/js4124/home/.cache/kagglehub...  benign            100   
4  /rds/general/user/js4124/home/.cache/kagglehub...  benign            100   

     tumor_subtype  
0  tubular_adenoma  
1  tubular_adenoma  
2  tubular_adenoma  
3  tubular_adenoma  
4  tubular_adenoma  


In [4]:
# df_small = pd.concat([df[df['label'] == 'benign'].sample(3, random_state=42),
#                         df[df['label'] != 'benign'].sample(7, random_state=42)])
# df_small

Unnamed: 0,filepath,label,magnification,tumor_subtype
767,/rds/general/user/js4124/home/.cache/kagglehub...,benign,200,phyllodes_tumor
259,/rds/general/user/js4124/home/.cache/kagglehub...,benign,100,tubular_adenoma
1068,/rds/general/user/js4124/home/.cache/kagglehub...,benign,400,fibroadenoma
2559,/rds/general/user/js4124/home/.cache/kagglehub...,malignant,100,papillary_carcinoma
5641,/rds/general/user/js4124/home/.cache/kagglehub...,malignant,200,ductal_carcinoma
4123,/rds/general/user/js4124/home/.cache/kagglehub...,malignant,400,ductal_carcinoma
2647,/rds/general/user/js4124/home/.cache/kagglehub...,malignant,100,papillary_carcinoma
2919,/rds/general/user/js4124/home/.cache/kagglehub...,malignant,100,papillary_carcinoma
5011,/rds/general/user/js4124/home/.cache/kagglehub...,malignant,40,ductal_carcinoma
3901,/rds/general/user/js4124/home/.cache/kagglehub...,malignant,100,ductal_carcinoma


In [11]:
def augment_and_normalize_images(df, output_folder, target_size=(150, 150)):
    # Create an image data generator with augmentation : rotation and flip
    datagen = tf.keras.preprocessing.image.ImageDataGenerator(rotation_range=90, horizontal_flip=True)

    for idx, row in df.iterrows():
        filepath, label = row["filepath"], row["label"]
        
        image = cv2.imread(filepath)  # Read image
        if image is None:
            print(f"Could not load: {filepath}")
            continue
        
        image = cv2.resize(image, target_size)  # Resize
        # Convert BGR to RGB because OpenCV loads in BGR and TensorFlow needs RGB
        x = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  
        x = x.astype('float32') / 255.0  # Normalize
        x = x.reshape((1,) + x.shape)  # Reshape for ImageDataGenerator
        
        image = image.astype('float32') / 255.0  # Normalize
        save_path = os.path.join(output_folder, f"original_{idx}.png")

        # save normalized original
        normalized_image = (x[0] * 255).astype(np.uint8)  # Convert back to uint8 for saving
        cv2.imwrite(save_path, cv2.cvtColor(normalized_image, cv2.COLOR_RGB2BGR))
        
        if label == "malignant":
            continue
        
        # generate one augmented image per original
        for i, batch in enumerate(datagen.flow(x, batch_size=1, save_to_dir=output_folder, save_prefix=f'augmented_{idx}', save_format='png')):
            if i == 0:
                break
                

In [12]:
augment_and_normalize_images(df, "../data/data_aug")