In [1]:
import kagglehub
import os
import numpy as np
import pandas as pd
import cv2
import tensorflow as tf
import PIL
import glob
print(PIL.__version__)  # Check if it’s installed correctly
from sklearn.model_selection import train_test_split
# Get data from kaggle - use these two lines to get BreakHis data!

path = kagglehub.dataset_download("ambarish/breakhis")
print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm
2025-03-26 23:09:17.115786: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


11.1.0
Path to dataset files: /rds/general/user/js4124/home/.cache/kagglehub/datasets/ambarish/breakhis/versions/4


In [2]:
# data_dir = "../data/versions/4/BreaKHis_v1/BreaKHis_v1/histology_slides/breast"
data_dir = "../../.cache/kagglehub/datasets/ambarish/breakhis/versions/4"
metadata = []
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.endswith(".png"):
            # Extract label from the folder structure
            label = "malignant" if "malignant" in root else "benign"
            
            # Extract magnification 
            
            magnification = None
            for part in root.split(os.sep):
                if part.endswith("X") and part[:-1].isdigit(): 
                    # magnification = part # turn to int
                    magnification = int(part[:-1])
                    break
            
            # Extract tumor subtype 
            tumor_subtype = None
            for part in root.split(os.sep):
                if part in ["adenosis", "fibroadenoma", "phyllodes_tumor", "tubular_adenoma",  # Benign subtypes
                           "ductal_carcinoma", "lobular_carcinoma", "mucinous_carcinoma", "papillary_carcinoma"]:  # Malignant subtypes
                    tumor_subtype = part
                    break
            
            # Append filepath, label, magnification, and tumor subtype to metadata
            metadata.append((os.path.join(root, file), label, magnification, tumor_subtype))



In [3]:
# Convert to DataFrame
df = pd.DataFrame(metadata, columns=["filepath", "label", "magnification", "tumor_subtype"])
malignant_df = df[df["label"] == "malignant"]

df.to_csv("../data/metadata.csv")

# Debugging: Check the shape and first few rows of the DataFrame
print(f"DataFrame shape: {df.shape}")
print(df.head())

DataFrame shape: (7909, 4)
                                            filepath   label  magnification  \
0  ../../.cache/kagglehub/datasets/ambarish/break...  benign            100   
1  ../../.cache/kagglehub/datasets/ambarish/break...  benign            100   
2  ../../.cache/kagglehub/datasets/ambarish/break...  benign            100   
3  ../../.cache/kagglehub/datasets/ambarish/break...  benign            100   
4  ../../.cache/kagglehub/datasets/ambarish/break...  benign            100   

     tumor_subtype  
0  tubular_adenoma  
1  tubular_adenoma  
2  tubular_adenoma  
3  tubular_adenoma  
4  tubular_adenoma  


In [4]:
train, hold_out = train_test_split(df, test_size=0.25, random_state=42)

train, test = train_test_split(train, test_size=0.25, random_state=42)

print("Train: ", train.shape)
print("Test: ", test.shape)
print("Holdout: ", hold_out.shape)

train.to_csv("../data/train_1.csv", index=False)
test.to_csv("../data/test_1.csv", index=False)
hold_out.to_csv("../data/holdout_1.csv", index=False)

Train:  (4448, 4)
Test:  (1483, 4)
Holdout:  (1978, 4)


In [5]:
# df_small = pd.concat([df[df['label'] == 'benign'].sample(3, random_state=42),
#                         df[df['label'] != 'benign'].sample(7, random_state=42)])
# df_small

In [6]:
def augment_and_normalize_images(df, output_folder, target_size=(150, 150)):
    # Create an image data generator with augmentation : rotation and flip
    datagen = tf.keras.preprocessing.image.ImageDataGenerator(rotation_range=90, horizontal_flip=True)
    # each item of the dict is a column not a dataframe
    aug_dict = {
        "filepath": [],
        "label": [],
        "magnification": [],
        "tumor_subtype": []
    }

    for idx, row in df.iterrows():
        filepath, label, mag, subtype = row["filepath"], row["label"], row["magnification"], row["tumor_subtype"]
        
        image = cv2.imread(filepath)  # Read image
        if image is None:
            print(f"Could not load: {filepath}")
            continue
        
        image = cv2.resize(image, target_size)  # Resize
        # Convert BGR to RGB because OpenCV loads in BGR and TensorFlow needs RGB
        x = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  
        x = x.astype('float32') / 255.0  # Normalize
        x = x.reshape((1,) + x.shape)  # Reshape for ImageDataGenerator
        
        image = image.astype('float32') / 255.0  # Normalize
        save_path_org = os.path.join(output_folder, f"original_{idx}.png")

        # save normalized original
        normalized_image = (x[0] * 255).astype(np.uint8)  # Convert back to uint8 for saving
        cv2.imwrite(save_path_org, cv2.cvtColor(normalized_image, cv2.COLOR_RGB2BGR))
        
        # add the original row in the dict of list (use dict for better efficiency than dataframe)
        aug_dict["filepath"].append(save_path_org)
        aug_dict["label"].append(label)
        aug_dict["magnification"].append(mag)
        aug_dict["tumor_subtype"].append(subtype)
        
        if label == "malignant":
            continue
        
        # generate one augmented image per original
        for i, batch in enumerate(datagen.flow(x, batch_size=1, save_to_dir=output_folder, save_prefix=f'augmented_{idx}', save_format='png')):
            save_path_aug = max(glob.glob(f'{output_folder}/augmented_{idx}*.png'), key=os.path.getctime)
            # add the augmented row in the dict of list (use dict for better efficiency than dataframe)
            aug_dict["filepath"].append(save_path_aug)
            aug_dict["label"].append(label)
            aug_dict["magnification"].append(mag)
            aug_dict["tumor_subtype"].append(subtype)
            if i == 0:
                break
    aug_df = pd.DataFrame(aug_dict)
    return aug_df
    
                

In [9]:
aug_train_df= augment_and_normalize_images(train, "../train_aug")

1) create augmented_df outside of the loop, at the beginning of the function
2) in augmented df, write the full absolute path : for original files it is easy, for augmented images retrieve the path from the loop before the break
3) return augmented_df at the end of the function
4) call the function and save the return value in a csv

In [10]:
aug_train_df.to_csv("../data/augmented_train_dataset.csv", index=False)

In [11]:
test.info

<bound method DataFrame.info of                                                filepath      label  \
658   ../../.cache/kagglehub/datasets/ambarish/break...     benign   
4630  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
1274  ../../.cache/kagglehub/datasets/ambarish/break...     benign   
4849  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
7291  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
...                                                 ...        ...   
5731  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
2985  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
72    ../../.cache/kagglehub/datasets/ambarish/break...     benign   
5609  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
7638  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   

      magnification        tumor_subtype  
658             100      phyllodes_tumor  
4630            100     ductal_carcinoma 

In [13]:
hold_out.info

<bound method DataFrame.info of                                                filepath      label  \
2815  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
5532  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
4553  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
3438  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
3023  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
...                                                 ...        ...   
6354  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
7308  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
5234  ../../.cache/kagglehub/datasets/ambarish/break...  malignant   
2251  ../../.cache/kagglehub/datasets/ambarish/break...     benign   
964   ../../.cache/kagglehub/datasets/ambarish/break...     benign   

      magnification        tumor_subtype  
2815             40  papillary_carcinoma  
5532            200     ductal_carcinoma 

In [15]:
aug_train_df.info

<bound method DataFrame.info of                                     filepath      label  magnification  \
0             ../train_aug/original_2621.png  malignant            400   
1             ../train_aug/original_4985.png  malignant            200   
2             ../train_aug/original_3990.png  malignant            200   
3             ../train_aug/original_2934.png  malignant            100   
4             ../train_aug/original_4068.png  malignant            200   
...                                      ...        ...            ...   
5815  ../train_aug/augmented_1365_0_5546.png     benign            400   
5816           ../train_aug/original_728.png     benign            400   
5817   ../train_aug/augmented_728_0_2283.png     benign            400   
5818          ../train_aug/original_6448.png  malignant            100   
5819          ../train_aug/original_7531.png  malignant            100   

            tumor_subtype  
0     papillary_carcinoma  
1        ductal_carcino

In [8]:
pwd

'/rds/general/user/js4124/home/ML_BreakHis/scr'