# Breast Cancer modules using ML
**By: Mustafa Adil**


This project uses the augmented BreaKHis dataset: 51.2k images overall, balanced with 1,600 images per subtype for each of the 8 subtypes at each of the 4 zoom (magnification) levels. Only the 40X zoom is used in this code, so there are 6,400 images for each of the malignant and benign classes.

# 1. Define Libraries

In [None]:
# Core Python libraries
import os
import time
import shutil
import pathlib
import itertools
import tempfile

# Data manipulation and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Image processing
from PIL import Image
import cv2
from skimage.feature import graycomatrix, graycoprops

# Machine Learning models and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Machine Learning classifiers
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    ExtraTreesClassifier, BaggingClassifier, StackingClassifier,
    VotingClassifier, HistGradientBoostingClassifier
)
from sklearn.svm import SVC
from sklearn.linear_model import (
    LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier,
    Perceptron
)
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import (
    GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
)
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.dummy import DummyClassifier
from sklearn.multiclass import OneVsRestClassifier

# External libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier

# ML Evaluation metrics
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, jaccard_score, matthews_corrcoef,
    cohen_kappa_score, roc_curve, classification_report
)

# Deep Learning (TensorFlow/Keras)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.optimizers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Flatten, Dense, Activation, Dropout, 
    BatchNormalization
)
from tensorflow.keras.callbacks import *
from tensorflow.keras import regularizers
from tensorflow.keras.metrics import *

# Word Document handling
from docx import Document
from docx.shared import Inches

# IPython display utilities
from IPython.display import display, HTML

In [None]:
# Use seaborn's dark-grid style with the pastel palette as the global plot
# theme, and keep that palette's colour list in `color`.
sns.set_theme(palette='pastel', style='darkgrid')
color = sns.color_palette('pastel')

# 2. Load images from the dataset
In this part we will do the following:
* Only the 40X zoom will be selected.
* Each file path will be stored in the variable ***filepaths*** and the corresponding label in the variable ***labels*** at the same index.
* Create a data frame ***df***, which works like an Excel sheet, with the columns filepath and label.
* Show a sample of the data frame.

In [None]:
import numpy as np
import pywt
from skimage.feature import graycomatrix, graycoprops
from tqdm import tqdm  # For progress bar

def compute_glcm_features(image):
    """Extract GLCM texture features from the wavelet subbands of an image.

    The image is decomposed with a 2-level 2D discrete wavelet transform
    ('db1'). Each of the 7 resulting subbands is quantized to 256 gray levels,
    a gray-level co-occurrence matrix (GLCM) is built over 4 distances and
    4 angles, and contrast, energy, homogeneity and correlation (16 values
    each) plus one entropy value are extracted.

    Parameters
    ----------
    image : 2D array-like
        Grayscale image.

    Returns
    -------
    numpy.ndarray
        1D vector of 7 * (4 * 16 + 1) = 455 features, grouped subband by
        subband in the order returned by ``pywt.wavedec2``: approximation,
        then level-2 details (H, V, D), then level-1 details (H, V, D).
        Within a subband the order is contrast, energy, homogeneity,
        correlation (each flattened distance-major, then angle), entropy.

    Raises
    ------
    ValueError
        If ``image`` is None.
    """
    if image is None:
        raise ValueError("image is None; cannot compute GLCM features")

    distances = [1, 2, 3, 4]  # You can experiment with different distances
    angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]  # You can experiment with different angles
    levels = 256
    props = ("contrast", "energy", "homogeneity", "correlation")

    # Apply 2D Discrete Wavelet Transform (DWT), decomposition up to 2 levels.
    # wavedec2 returns the coarsest level first:
    #   [cA2, (cH2, cV2, cD2), (cH1, cV1, cD1)]
    coeffs = pywt.wavedec2(image, 'db1', level=2)
    cA2, (cH2, cV2, cD2), (cH1, cV1, cD1) = coeffs

    def quantize(subband):
        # Wavelet coefficients are floats that can be negative (detail bands)
        # or far above 255 (approximation band); a bare astype(np.uint8)
        # would wrap them around and corrupt the GLCM. Min-max rescale into
        # [0, levels - 1] before converting to integers.
        band = np.asarray(subband, dtype=np.float64)
        lo, hi = band.min(), band.max()
        if hi == lo:
            # Constant subband: every pixel falls into gray level 0.
            return np.zeros(band.shape, dtype=np.uint8)
        scaled = (band - lo) * ((levels - 1) / (hi - lo))
        return np.rint(scaled).astype(np.uint8)

    def extract_glcm_features(subband):
        glcm = graycomatrix(
            quantize(subband),
            distances=distances,
            angles=angles,
            levels=levels,
            symmetric=True,
            normed=True,
        )
        texture = [graycoprops(glcm, prop).ravel() for prop in props]
        # Entropy is computed manually over the whole GLCM stack (all
        # distances and angles together); the epsilon avoids log(0).
        entropy = -np.sum(glcm * np.log(glcm + 1e-10))
        return np.concatenate(texture + [[entropy]])

    # Extract GLCM features for each wavelet subband, in wavedec2 order
    features = [
        extract_glcm_features(subband)
        for subband in (cA2, cH2, cV2, cD2, cH1, cV1, cD1)
    ]
    return np.concatenate(features)  # Concatenate all extracted features


def process_one_image(path):
    """Read one image as grayscale and return its GLCM feature vector.

    Parameters
    ----------
    path : str
        Path of the image file to read.

    Returns
    -------
    numpy.ndarray
        The result of ``compute_glcm_features`` for the image.

    Raises
    ------
    OSError
        If the image cannot be read.
    """
    image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread does not raise on a missing or unreadable file; it
        # returns None, which would otherwise fail later with an unrelated
        # error that does not mention the offending path.
        raise OSError(f"Could not read image: {path!r}")
    return compute_glcm_features(image)
    
def load_and_compute_glcm_features(paths):
    """Compute GLCM features for every image path, one after another.

    Parameters
    ----------
    paths : iterable of str
        Image file paths.

    Returns
    -------
    numpy.ndarray
        Array built from the per-image feature vectors, in the same order as
        ``paths``.
    """
    # Delegate to process_one_image so the sequential and parallel loaders
    # read and process each image in exactly the same way.
    return np.array([
        process_one_image(path)
        for path in tqdm(paths, desc="Processing Images", unit="img")
    ])

from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np

def par_load_and_compute_glcm_features(paths, n_jobs=-1):
    """Compute GLCM features for every image path using joblib workers.

    Parameters
    ----------
    paths : iterable of str
        Image file paths.
    n_jobs : int, default -1
        Passed to ``joblib.Parallel``; -1 uses all available CPU cores.

    Returns
    -------
    numpy.ndarray
        Per-image feature vectors stacked row-wise, in the order of ``paths``.
    """
    worker_pool = Parallel(n_jobs=n_jobs)
    progress = tqdm(paths, desc="Processing Images", unit="img")
    tasks = (delayed(process_one_image)(image_path) for image_path in progress)
    per_image = worker_pool(tasks)
    return np.vstack(per_image)

In [None]:

import os
import pandas as pd

# Locations of the augmented dataset root and of its manifest CSV
csv_path = "E:/BreaKHis_augmentedv2/aug_manifest_filled.csv"
base_dir = "E:/BreaKHis_augmentedv2"

# Load the manifest and preview its first rows
df = pd.read_csv(csv_path)
print(df.head())

# Keep only the rows of fold 1 (rename "fold" here if the manifest uses a
# different column name, e.g. 'Fold' or 'kfold'). Work on a copy so the
# edits below leave `df` untouched.
df_fold1 = df.loc[df["fold"].eq(1)].copy()

# Turn each manifest filename into a path under base_dir; a leading "/" is
# stripped first so os.path.join does not discard base_dir.
df_fold1["filename"] = [
    os.path.join(base_dir, str(name).lstrip("/"))
    for name in df_fold1["filename"]
]

# Image paths to process, in manifest order
paths = df_fold1["filename"].to_list()

# Labels are not extracted in this cell. If needed, read them from the
# manifest's label column (adjust the name, e.g. 'class' or 'target'):
# labels = df_fold1["label"].to_numpy()

# Row i of 'features' holds the GLCM features of the image at paths[i]
features = par_load_and_compute_glcm_features(paths)


In [None]:
import numpy as np

# Must mirror the settings used inside compute_glcm_features
distances = [1, 2, 3, 4]
angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
angle_labels = [0, 45, 90, 135]   # for readable names

# Subband labels in the order the features are actually produced.
# pywt.wavedec2(..., level=2) returns [cA2, (cH2, cV2, cD2), (cH1, cV1, cD1)]
# (coarsest level first), so the first group of detail features belongs to
# level 2 and the second group to level 1.
subbands = ["cA", "cH2", "cV2", "cD2", "cH1", "cV1", "cD1"]
props = ["contrast", "energy", "homogeneity", "correlation"]

feature_cols = []

for sb in subbands:
    # graycoprops output is shape (len(distances), len(angles));
    # .ravel() flattens with distance index first, then angle index
    feature_cols.extend(
        f"{sb}_{prop}_d{d}_a{ang_deg}"
        for prop in props
        for d in distances
        for ang_deg in angle_labels
    )
    # one entropy per subband
    feature_cols.append(f"{sb}_entropy")

# 7 subbands * (4 props * 16 + 1 entropy) = 455 names; fail early with a
# clear message if that does not match the width of the feature matrix.
if len(feature_cols) != features.shape[1]:
    raise ValueError(
        f"Expected {len(feature_cols)} feature columns, "
        f"but 'features' has {features.shape[1]}"
    )

df_features = pd.DataFrame(features, columns=feature_cols)
df = pd.concat(
    [pd.Series(paths, name="path"), df_features],
    axis=1
)

df.to_csv("glcm_features_fold1.csv", index=False)

# Save to Excel
df.to_excel("glcm_features_fold1.xlsx", index=False)
