In [5]:
import zipfile
import os

import shutil

# Archive to unpack and the folder the rest of the notebook reads from.
ZIP_PATH = "BreakHis.zip"
EXTRACT_PATH = "BreakHis"

# Extract only once. The archive is first unpacked into a staging folder and
# renamed on success, so an interrupted extraction can never leave a
# half-populated EXTRACT_PATH that later runs would mistake for a complete one.
if not os.path.exists(EXTRACT_PATH):
    staging_path = EXTRACT_PATH + ".partial"

    # Drop leftovers from a previously interrupted run before starting over.
    shutil.rmtree(staging_path, ignore_errors=True)

    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(staging_path)

    os.rename(staging_path, EXTRACT_PATH)

print("Dataset extracted and ready for use")


Dataset extracted and ready for use


In [8]:
import os
import shutil
import pandas as pd

# Root folder of the extracted dataset
BASE_DIR = "BreakHis"

# Folder inside the extracted archive that holds the class sub-folders
original_dataset = os.path.join(
    BASE_DIR, "BreaKHis_v1", "BreaKHis_v1", "histology_slides", "breast"
)

# Flat folder that receives a renamed copy of every image; created on demand
images = os.path.join(BASE_DIR, "mixed_images")
os.makedirs(images, exist_ok=True)

# Echo both locations so a wrong path is spotted immediately
for heading, location in (
    ("Original dataset path:", original_dataset),
    ("\nMixed images path:", images),
):
    print(heading)
    print(location)


Original dataset path:
BreakHis\BreaKHis_v1\BreaKHis_v1\histology_slides\breast

Mixed images path:
BreakHis\mixed_images


In [9]:
def _sorted_subdirs(parent):
    """Yield (name, path) for every sub-directory of `parent`, in sorted name order."""
    for entry in sorted(os.listdir(parent)):
        entry_path = os.path.join(parent, entry)
        if os.path.isdir(entry_path):
            yield entry, entry_path


# One dict per copied image, plus a running counter used to build the new names.
image_records = []
image_counter = 0

# Folder layout walked: class / protocol / tumor type / patient / magnification / file.
# Non-directory entries are ignored at every level above the image files.
for class_name, class_path in _sorted_subdirs(original_dataset):  # benign / malignant
    for _protocol, protocol_path in _sorted_subdirs(class_path):  # SOB
        for tumor_type, tumor_type_path in _sorted_subdirs(protocol_path):  # adenosis, ductal_carcinoma
            for _patient, patient_path in _sorted_subdirs(tumor_type_path):  # SOB_B_A_xxxx
                for magnification, magnification_path in _sorted_subdirs(patient_path):  # 40X,100X,...
                    for image_name in sorted(os.listdir(magnification_path)):
                        # Only image files are copied; anything else is skipped.
                        if not image_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                            continue

                        # Sequential, zero-padded name; the copy always gets a
                        # ".png" suffix regardless of the source extension.
                        image_counter += 1
                        new_image_name = f"img_{image_counter:06d}.png"

                        shutil.copy(
                            os.path.join(magnification_path, image_name),
                            os.path.join(images, new_image_name),
                        )

                        # Labels are taken from the folder names along the path.
                        image_records.append({
                            "image_name": new_image_name,
                            "class": class_name,         # benign / malignant
                            "tumor_type": tumor_type,    # adenosis, DC, LC, etc
                            "magnification": magnification
                        })


In [14]:
# Destination of the label table (inside the working dataset folder)
labels_csv_path = os.path.join("BreakHis", "labels.csv")

# One row per copied image, written without the DataFrame index
labels_df = pd.DataFrame(image_records)
labels_df.to_csv(labels_csv_path, index=False)

# Summary of how many files were copied
print(f"Total images copied: {image_counter}")




Total images copied: 7909


In [13]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras import layers, models, optimizers
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [16]:
# Reload the label table from the CSV at labels_csv_path.
labels_df = pd.read_csv(labels_csv_path)
# Display the first rows as a quick sanity check.
labels_df.head()

Unnamed: 0,image_name,class,tumor_type,magnification
0,img_000001.png,benign,adenosis,100X
1,img_000002.png,benign,adenosis,100X
2,img_000003.png,benign,adenosis,100X
3,img_000004.png,benign,adenosis,100X
4,img_000005.png,benign,adenosis,100X


In [17]:
# Binary target encoding: benign -> 0, malignant -> 1.
label_map = {"benign": 0, "malignant": 1}
# Adds a numeric "label" column; a class value missing from label_map would become NaN.
labels_df["label"] = labels_df["class"].map(label_map)

# Number of images per encoded label.
labels_df["label"].value_counts()


label
1    5429
0    2480
Name: count, dtype: int64

In [18]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# "balanced" weights for labels 0 and 1, computed over every row of labels_df
class_weights_array = compute_class_weight(
    class_weight="balanced", classes=np.array([0, 1]), y=labels_df["label"].values
)

# Dict form {label: weight}; index 0 is benign, index 1 is malignant
class_weights = dict(enumerate(class_weights_array))

print("Class weights:", class_weights)


Class weights: {0: np.float64(1.5945564516129032), 1: np.float64(0.7284030208141462)}


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the images for validation, keeping the benign/malignant ratio
# the same in both parts (stratify) and fixing the seed for reproducibility.
# This split must run: later cells build their datasets from train_df / val_df.
# NOTE(review): the split is per image; labels_df carries no patient identifier,
# so images of one patient may land in both parts — confirm this is acceptable.
train_df, val_df = train_test_split(
    labels_df,
    test_size=0.2,
    stratify=labels_df["label"],
    random_state=42
)


In [None]:
import tensorflow as tf
import os

# Side length in pixels that load_image resizes every image to.
IMG_SIZE = 224
# Number of images per batch produced by build_dataset.
BATCH_SIZE = 16
# Folder holding the image files; build_dataset joins it with each image_name.
IMAGES_DIR = "BreakHis/mixed_images"

def load_image(path, label):
    """Read one image file and turn it into a model-ready tensor.

    path  : path of the image file to read
    label : label paired with the image; returned unchanged

    Returns (image, label), where image is a 3-channel float32 tensor resized
    to IMG_SIZE x IMG_SIZE and divided by 255.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, (IMG_SIZE, IMG_SIZE))
    scaled = tf.cast(resized, tf.float32) / 255.0
    return scaled, label


def build_dataset(df, shuffle=True):
    """Build a batched tf.data pipeline from a label table.

    df      : DataFrame with an "image_name" column (file name inside
              IMAGES_DIR) and a "label" column (integer class id)
    shuffle : when True, the samples are shuffled before being loaded

    Returns a tf.data.Dataset yielding (image_batch, label_batch) with
    BATCH_SIZE samples per batch; images are produced by load_image.
    """
    paths = df["image_name"].apply(
        lambda x: os.path.join(IMAGES_DIR, x)
    ).values

    labels = df["label"].values.astype(np.int32)

    ds = tf.data.Dataset.from_tensor_slices((paths, labels))

    if shuffle:
        # Shuffle the lightweight (path, label) pairs *before* decoding, with a
        # buffer that spans the whole table: this yields a uniform shuffle and
        # avoids keeping a large buffer of decoded images in memory.
        ds = ds.shuffle(buffer_size=max(len(paths), 1))

    ds = ds.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)

    ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    return ds


In [None]:
# train_df / val_df are expected to come from a train/validation split of labels_df.
# The training pipeline is shuffled; the validation pipeline is not.
train_ds = build_dataset(train_df, shuffle=True)
val_ds   = build_dataset(val_df, shuffle=False)


In [21]:
from tensorflow.keras.applications import ResNet152
from tensorflow.keras import layers, models

def build_resnet152():
    """Assemble a ResNet152-based two-class classifier.

    The backbone has no top and no pre-trained weights (weights=None) and takes
    224x224x3 inputs. Its output is global-average-pooled, passed through a
    256-unit ReLU layer named "feature_vector", and finished with a 2-way
    softmax.

    Returns the uncompiled Keras model.
    """
    backbone = ResNet152(include_top=False, weights=None, input_shape=(224, 224, 3))

    pooled = layers.GlobalAveragePooling2D()(backbone.output)
    embedding = layers.Dense(256, activation="relu", name="feature_vector")(pooled)
    class_probs = layers.Dense(2, activation="softmax")(embedding)

    return models.Model(backbone.input, class_probs)


In [22]:
from tensorflow.keras.applications import VGG19

def build_vgg19():
    """Assemble a VGG19-based two-class classifier.

    The backbone has no top and no pre-trained weights (weights=None) and takes
    224x224x3 inputs. Unlike the other builders, its output is flattened rather
    than pooled, then passed through a 256-unit ReLU layer named
    "feature_vector" and a 2-way softmax.

    Returns the uncompiled Keras model.
    """
    backbone = VGG19(include_top=False, weights=None, input_shape=(224, 224, 3))

    flattened = layers.Flatten()(backbone.output)
    embedding = layers.Dense(256, activation="relu", name="feature_vector")(flattened)
    class_probs = layers.Dense(2, activation="softmax")(embedding)

    return models.Model(backbone.input, class_probs)


In [23]:
from tensorflow.keras.applications import Xception

def build_xception():
    """Assemble an Xception-based two-class classifier.

    The backbone has no top and no pre-trained weights (weights=None) and takes
    224x224x3 inputs. Its output is global-average-pooled, passed through a
    256-unit ReLU layer named "feature_vector", and finished with a 2-way
    softmax.

    Returns the uncompiled Keras model.
    """
    backbone = Xception(include_top=False, weights=None, input_shape=(224, 224, 3))

    pooled = layers.GlobalAveragePooling2D()(backbone.output)
    embedding = layers.Dense(256, activation="relu", name="feature_vector")(pooled)
    class_probs = layers.Dense(2, activation="softmax")(embedding)

    return models.Model(backbone.input, class_probs)


In [None]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import layers, models

def build_efficientnetb0():
    """Assemble an EfficientNetB0-based two-class classifier.

    The backbone has no top and no pre-trained weights (weights=None) and takes
    224x224x3 inputs. Its output is global-average-pooled, passed through a
    256-unit ReLU layer named "feature_vector", and finished with a 2-way
    softmax.

    Keras' EfficientNet contains its own input rescaling and is documented to
    expect pixel values in [0, 255]. This notebook's load_image already divides
    pixels by 255, so the inputs are scaled back up before entering the
    backbone; otherwise they would be divided by 255 twice.

    Returns the uncompiled Keras model.
    """
    inputs = layers.Input(shape=(224, 224, 3))

    # Undo the upstream /255 so the backbone's built-in rescaling sees [0, 255].
    restored = layers.Rescaling(255.0, name="restore_0_255")(inputs)

    base = EfficientNetB0(
        include_top=False,
        weights=None,
        input_tensor=restored
    )

    x = layers.GlobalAveragePooling2D()(base.output)
    features = layers.Dense(
        256, activation="relu", name="feature_vector"
    )(x)
    output = layers.Dense(2, activation="softmax")(features)

    model = models.Model(inputs, output)
    return model


In [25]:
from tensorflow.keras.applications import MobileNetV3Large

def build_mobilenetv3():
    """Assemble a MobileNetV3Large-based two-class classifier.

    The backbone has no top and no pre-trained weights (weights=None) and takes
    224x224x3 inputs. Its output is global-average-pooled, passed through a
    256-unit ReLU layer named "feature_vector", and finished with a 2-way
    softmax.

    Returns the uncompiled Keras model.
    """
    base = MobileNetV3Large(
        include_top=False,
        weights=None,
        input_shape=(224,224,3),
        # The built-in preprocessing layer assumes raw [0, 255] pixels. This
        # notebook's load_image already scales pixels to [0, 1], so leaving it
        # on would squash every input into a sliver just above -1.
        include_preprocessing=False
    )

    x = layers.GlobalAveragePooling2D()(base.output)
    features = layers.Dense(256, activation="relu", name="feature_vector")(x)
    output = layers.Dense(2, activation="softmax")(features)

    model = models.Model(base.input, output)
    return model


In [26]:
from tensorflow.keras.optimizers import Adam

def train_model(model, name, epochs=10, learning_rate=1e-4):
    """Compile and train a classifier, then return its feature extractor.

    model         : uncompiled Keras model containing a layer named
                    "feature_vector"
    name          : label printed in the training banner
    epochs        : number of training epochs (default 10)
    learning_rate : Adam learning rate (default 1e-4)

    Training reads the notebook-level train_ds, val_ds and class_weights.

    Returns a Keras model that maps the classifier's input to the output of
    its "feature_vector" layer.
    """
    print(f"\nTraining {name}")
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        # Integer class ids (not one-hot) are paired with the 2-way softmax head.
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    model.summary()  # REQUIRED by instructions

    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        class_weight=class_weights
    )

    # Re-use the trained graph up to the named embedding layer.
    feature_extractor = tf.keras.Model(
        inputs=model.input,
        outputs=model.get_layer("feature_vector").output
    )

    return feature_extractor


In [None]:
# Maps a model name to the feature extractor returned by train_model.
feature_extractors = {}

# Train the ResNet152 classifier and keep its feature extractor.
feature_extractors["ResNet152"]  = train_model(build_resnet152(), "ResNet152")



In [None]:
# Train the EfficientNetB0 classifier and keep its feature extractor.
feature_extractors["EfficientNetB0"]    = train_model(build_efficientnetb0(), "EfficientNetB0")

In [None]:
# Train the VGG19 classifier and keep its feature extractor.
feature_extractors["VGG19"]      = train_model(build_vgg19(), "VGG19")

In [None]:
# Train the Xception classifier and keep its feature extractor.
feature_extractors["Xception"]   = train_model(build_xception(), "Xception")

In [None]:
# Train the MobileNetV3 classifier and keep its feature extractor.
feature_extractors["MobileNetV3"]= train_model(build_mobilenetv3(), "MobileNetV3")

In [None]:
# Unshuffled pipeline over every image, so feature rows follow labels_df order
full_ds = build_dataset(labels_df, shuffle=False)

# Output folder for the per-model feature matrices
FEATURE_DIR = "saved_features"
os.makedirs(FEATURE_DIR, exist_ok=True)

# Model name -> feature matrix returned by that model's extractor
feature_vectors = {}

for name, extractor in feature_extractors.items():
    features = extractor.predict(full_ds)
    feature_vectors[name] = features

    # Persist the matrix so clustering can be re-run without re-training
    features_file = os.path.join(FEATURE_DIR, f"{name}_features.npy")
    np.save(features_file, features)

    print(f"✅ {name} features saved:", features.shape)



In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd


In [None]:
def run_kmeans_and_label_change(features, true_labels, k=2):
    """
    Cluster feature vectors with K-Means and measure disagreement with the labels.

    Steps: standardise the features, reduce them with PCA (keeping 95% of the
    variance), run K-Means, then relabel every cluster with the most frequent
    true label among its members.

    features     : feature matrix (N x D)
    true_labels  : ground-truth labels (0/1)
    k            : number of clusters

    Returns (change_percentage, changed, mapped_clusters):
    change_percentage : percentage of samples whose relabelled cluster differs
                        from the true label
    changed           : per-sample boolean flags of those differences
    mapped_clusters   : per-sample label assigned through the cluster mapping
    """
    from sklearn.decomposition import PCA

    # K-Means is distance based, so put every feature on the same scale first
    standardized = StandardScaler().fit_transform(features)

    # Keep enough principal components to explain 95% of the variance
    reducer = PCA(n_components=0.95, random_state=42)
    reduced = reducer.fit_transform(standardized)

    print("Original dim:", features.shape[1])
    print("Reduced dim:", reduced.shape[1])

    clusterer = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = clusterer.fit_predict(reduced)

    # Cluster ids are arbitrary: give each cluster its majority true label
    assignments = pd.DataFrame({"true": true_labels, "cluster": cluster_labels})
    mapping = {
        cluster_id: assignments.loc[assignments["cluster"] == cluster_id, "true"].mode()[0]
        for cluster_id in np.unique(cluster_labels)
    }
    mapped_clusters = np.array([mapping[cluster_id] for cluster_id in cluster_labels])

    # A sample counts as changed when its mapped cluster label differs from the truth
    changed = mapped_clusters != true_labels
    change_percentage = changed.mean() * 100

    return change_percentage, changed, mapped_clusters


In [None]:
import os, numpy as np

# Ground-truth labels, in labels_df row order
true_labels = labels_df["label"].values

# Persist the labels next to the saved features for later clustering runs
LABEL_DIR = "saved_labels"
os.makedirs(LABEL_DIR, exist_ok=True)
np.save(os.path.join(LABEL_DIR, "true_labels.npy"), true_labels)
print("✅ True labels saved for future clustering")

# Model name -> clustering outcome for that model's features
results = {}

for model_name, features in feature_vectors.items():
    change_pct, changed_flags, mapped_clusters = run_kmeans_and_label_change(
        features, true_labels, k=2
    )

    results[model_name] = {
        "change_percentage": change_pct,
        "changed_flags": changed_flags,
        "mapped_clusters": mapped_clusters
    }

    print(f"{model_name} → Label change percentage: {change_pct:.2f}%")


In [None]:
# One row per model with the percentage of samples whose label changed.
summary_df = pd.DataFrame({
    "Model": list(results),
    "Label Change (%)": [results[m]["change_percentage"] for m in results]
})

# sort_values returns a new frame, so the result must be assigned; otherwise
# the CSV is written in insertion order rather than ranked by label change.
summary_df = summary_df.sort_values("Label Change (%)").reset_index(drop=True)
summary_df.to_csv("model_comparison_label_change.csv", index=False)
