In [None]:
def extract_mc_features(mc_contours, tophat_norm):
    """
    Summarise detected microcalcifications (MCs) as a five-value feature list.

    Parameters
    ----------
    mc_contours : sequence
        Contours in the form accepted by ``cv2.contourArea`` and
        ``cv2.boundingRect``. May be empty.
    tophat_norm : numpy.ndarray
        Image whose pixel values are averaged inside the filled contours.

    Returns
    -------
    list
        ``[count, mean_area, std_area, density, mean_intensity]``.
        With no contours, every value after the count is 0.0.
        With exactly one contour, the density is fixed at 1.0.
        Otherwise the density is the count divided by the area of the
        rectangle enclosing all contour bounding boxes (plus 1e-6).
    """
    n_mcs = len(mc_contours)

    # Guard clause: nothing detected, so every feature keeps its zero default.
    if n_mcs == 0:
        return [n_mcs, 0.0, 0.0, 0.0, 0.0]

    # --- area statistics ---
    contour_areas = [cv2.contourArea(contour) for contour in mc_contours]
    area_mean = float(np.mean(contour_areas))
    area_spread = float(np.std(contour_areas))

    # --- cluster density ---
    if n_mcs == 1:
        density = 1.0  # fixed default for a single MC
    else:
        rects = [cv2.boundingRect(contour) for contour in mc_contours]
        # Collect both edges of every bounding box along each axis.
        x_edges = [edge for (x, _, w, _) in rects for edge in (x, x + w)]
        y_edges = [edge for (_, y, _, h) in rects for edge in (y, y + h)]
        span_x = max(x_edges) - min(x_edges)
        span_y = max(y_edges) - min(y_edges)
        # The small epsilon keeps the division defined for a zero-area box.
        density = n_mcs / (span_x * span_y + 1e-6)

    # --- average intensity inside the MCs ---
    filled = np.zeros_like(tophat_norm, dtype=np.uint8)
    for contour in mc_contours:
        # Thickness -1 fills the contour interior with 255.
        cv2.drawContours(filled, [contour], -1, 255, -1)

    inside_vals = tophat_norm[filled == 255]
    mean_intensity = float(np.mean(inside_vals)) if len(inside_vals) > 0 else 0.0

    return [n_mcs, area_mean, area_spread, density, mean_intensity]

def build_master_vector(mass_features_df, mc_features_list):
    """
    Concatenate the first row of the mass-feature DataFrame with the MC
    features into a single flat Python list.

    Parameters
    ----------
    mass_features_df : pandas.DataFrame
        Mass features. Only row 0 is used; any further rows are ignored.
    mc_features_list : iterable
        MC feature values. Any iterable is accepted (list, tuple, NumPy
        array, Series); it is copied, never modified.

    Returns
    -------
    list
        A new list: the values of row 0 followed by the MC features.

    Raises
    ------
    IndexError
        If ``mass_features_df`` has no rows.
    """
    # Convert the first DataFrame row to a plain list
    mass_features_list = mass_features_df.iloc[0].tolist()

    # Copy the MC features into a real list before concatenating:
    # ``list + ndarray`` broadcast-adds (or raises) instead of appending,
    # and ``list + tuple`` raises TypeError.
    master_vector = mass_features_list + list(mc_features_list)
    return master_vector


In [53]:
import pandas as pd
import cv2
import numpy as np

from Feature_Extraction import (
    extract_mass_features)

from input import get_mass_data


# Load original dataset (one row per image; the 'PATH' and 'TYPE' columns are read below)
df = pd.read_csv(
    r"E:\Breast_Cancer_Detection_Imp\test_dataset.csv"
)

all_rows = []
import time

# The top-hat structuring element is the same for every image, so build it
# once here instead of on every loop iteration.
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (9, 9))

# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted.
start_total = time.perf_counter()

for idx, row in df.iterrows():
    start = time.perf_counter()

    img_path = row['PATH']
    label    = row['TYPE']   # or CLASS

    # Load image
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # Unreadable images are skipped, so the output can have fewer rows
        # than the input CSV.
        print(f"Could not read {img_path}")
        continue

    # PREPROCESSING
    mass_region, clean_mask = get_mass_data(img)

    if mass_region is None or clean_mask is None:
        # No usable mass region: skip this image entirely.
        print(f"No mass detected in {img_path}")
        continue

    # MASS FEATURES
    # build_master_vector() reads row 0 of this result via .iloc, so it is
    # presumably a one-row DataFrame -- TODO confirm against Feature_Extraction.
    mass_features = extract_mass_features(
        mass_region_gray=mass_region,
        clean_mask=clean_mask
    )

    # MC PREPROCESSING
    # White top-hat of the mass region, then a fixed threshold at 200.
    # NOTE(review): the threshold is applied to the raw top-hat output, while
    # extract_mc_features() names its parameter `tophat_norm` -- confirm
    # whether a normalisation step was intended before thresholding.
    tophat = cv2.morphologyEx(mass_region, cv2.MORPH_TOPHAT, kernel)

    _, mc_bin = cv2.threshold(tophat, 200, 255, cv2.THRESH_BINARY)

    mc_contours, _ = cv2.findContours(
        mc_bin,
        cv2.RETR_EXTERNAL,
        cv2.CHAIN_APPROX_SIMPLE
    )

    # MC FEATURES
    mc_features = extract_mc_features(mc_contours, tophat)

    # MASTER FEATURE VECTOR (mass features + MC features + label as last entry)
    master_vector = build_master_vector(mass_features, mc_features)
    master_vector.append(label)

    all_rows.append(master_vector)

    end = time.perf_counter()
    print(f"Row {idx+1} processed in {end-start:.2f} seconds")

print(f"Total time: {time.perf_counter()-start_total:.2f} seconds")


# Save final dataset
# 15 mass-feature names, 5 MC-feature names, then the label. The DataFrame
# constructor below raises if a row's length does not match these 21 columns.
FEATURE_COLUMNS = [
    'Area','Perimeter','Circularity','Eccentricity','Solidity','Extent',
    'Mean_Intensity','Max_Intensity','Std_Intensity',
    'Contrast','Dissimilarity','Homogeneity','Energy','Correlation','ASM',
    'MC_Count','MC_AvgArea','MC_StdArea','MC_Density','MC_MeanIntensity',
    'Label'
]

final_df = pd.DataFrame(all_rows, columns=FEATURE_COLUMNS)
# Written relative to the current working directory.
final_df.to_excel("svm_features_test_dataset.xlsx", index=False)


Row 1 processed in 9.15 seconds
Row 2 processed in 4.84 seconds
Row 3 processed in 7.06 seconds
Row 4 processed in 5.58 seconds
Row 5 processed in 15.08 seconds
Row 6 processed in 9.30 seconds
Row 7 processed in 4.74 seconds
Row 8 processed in 6.86 seconds
Row 9 processed in 8.78 seconds
Row 10 processed in 7.21 seconds
Row 11 processed in 7.84 seconds
Row 12 processed in 7.08 seconds
Row 13 processed in 14.25 seconds
Row 14 processed in 6.60 seconds
Row 15 processed in 7.14 seconds
Row 16 processed in 13.63 seconds
Row 17 processed in 8.93 seconds
Row 18 processed in 6.51 seconds
Row 19 processed in 8.40 seconds
Row 20 processed in 6.34 seconds
Row 21 processed in 5.45 seconds
Row 22 processed in 4.52 seconds
Row 23 processed in 9.07 seconds
Row 24 processed in 6.39 seconds
Row 25 processed in 5.52 seconds
Row 26 processed in 6.40 seconds
Row 27 processed in 6.90 seconds
Row 28 processed in 7.32 seconds
Row 29 processed in 6.94 seconds
Row 30 processed in 8.04 seconds
Row 31 processed

In [57]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Read the precomputed feature tables: one for training, one for testing
train_data = pd.read_excel(r"E:\Breast_Cancer_Detection_Imp\svm_features_dataset.xlsx")
test_data = pd.read_excel(r"E:\Breast_Cancer_Detection_Imp\svm_features_test_dataset.xlsx")

# The last column is the class label; every column before it is a feature
X_train, y_train = train_data.iloc[:, :-1].values, train_data.iloc[:, -1].values
X_test, y_test = test_data.iloc[:, :-1].values, test_data.iloc[:, -1].values

# Standardise features; the scaler is fitted on the training split only and
# then reused unchanged for the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# RBF-kernel SVM; fit() returns the estimator itself, so it can be chained
svm_model = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    decision_function_shape='ovr',
    class_weight='balanced',  # weights classes inversely to their frequency
).fit(X_train, y_train)

# Score the held-out test split
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.43478260869565216
                     precision    recall  f1-score   support

               MASS       0.33      0.58      0.42        12
MICRO_CALCIFICATION       0.29      0.67      0.40         6
             NORMAL       0.82      0.32      0.46        28

           accuracy                           0.43        46
          macro avg       0.48      0.52      0.43        46
       weighted avg       0.62      0.43      0.44        46



In [35]:
import os

# Path to your folder containing the .pgm files
folder_path = r"E:\Breast_Cancer_Detection_Imp\Dataset\all-mias"

# Loop through all files in the folder (os.listdir returns a snapshot list,
# so renaming while iterating is safe)
for filename in os.listdir(folder_path):
    if not filename.endswith(".pgm"):
        continue

    # Split the name and extension
    name, ext = os.path.splitext(filename)

    # Only strip a genuine two-letter suffix (e.g. "mdb001lm" -> "mdb001").
    # Without this guard the cell is not idempotent: a second run would cut
    # two more characters from already-renamed files ("mdb001" -> "mdb0").
    suffix = name[-2:]
    if len(name) <= 2 or not suffix.isalpha():
        continue

    # Remove last 2 letters from the name
    new_name = name[:-2] + ext

    # Full paths
    old_path = os.path.join(folder_path, filename)
    new_path = os.path.join(folder_path, new_name)

    # Never clobber an existing file: os.rename silently replaces the target
    # on POSIX and raises FileExistsError on Windows.
    if os.path.exists(new_path):
        print(f"Skipped: {filename} -> {new_name} (target already exists)")
        continue

    # Rename the file
    os.rename(old_path, new_path)
    print(f"Renamed: {filename} -> {new_name}")

print("All files renamed successfully.")


Renamed: mdb001lm.pgm -> mdb001.pgm
Renamed: mdb002rl.pgm -> mdb002.pgm
Renamed: mdb003ll.pgm -> mdb003.pgm
Renamed: mdb004rl.pgm -> mdb004.pgm
Renamed: mdb005ll.pgm -> mdb005.pgm
Renamed: mdb006rl.pgm -> mdb006.pgm
Renamed: mdb007ll.pgm -> mdb007.pgm
Renamed: mdb008rl.pgm -> mdb008.pgm
Renamed: mdb009ll.pgm -> mdb009.pgm
Renamed: mdb010rm.pgm -> mdb010.pgm
Renamed: mdb011ll.pgm -> mdb011.pgm
Renamed: mdb012rl.pgm -> mdb012.pgm
Renamed: mdb013ll.pgm -> mdb013.pgm
Renamed: mdb014rl.pgm -> mdb014.pgm
Renamed: mdb015lm.pgm -> mdb015.pgm
Renamed: mdb016rm.pgm -> mdb016.pgm
Renamed: mdb017ls.pgm -> mdb017.pgm
Renamed: mdb018rs.pgm -> mdb018.pgm
Renamed: mdb019ll.pgm -> mdb019.pgm
Renamed: mdb020rl.pgm -> mdb020.pgm
Renamed: mdb021ll.pgm -> mdb021.pgm
Renamed: mdb022rm.pgm -> mdb022.pgm
Renamed: mdb023ll.pgm -> mdb023.pgm
Renamed: mdb024rl.pgm -> mdb024.pgm
Renamed: mdb025ll.pgm -> mdb025.pgm
Renamed: mdb026rl.pgm -> mdb026.pgm
Renamed: mdb027ll.pgm -> mdb027.pgm
Renamed: mdb028rl.pgm -> mdb