<a href="https://colab.research.google.com/github/nehanataraj/breastcancer-detection/blob/main/breastcancer-detectionSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Library imports
import os
import cv2
import numpy as np
import random
import zipfile
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Upload the dataset archive(s) through the Colab file picker and unpack each
# one under /content. The extracted tree is expected to contain "Male" and
# "Female" subfolders inside "breastcancerdata".
uploaded = files.upload()
for archive_name in uploaded:
    with zipfile.ZipFile(archive_name, mode='r') as archive:
        archive.extractall(path="/content")

base_dir = "/content/breastcancerdata"
mbc_dir, fbc_dir = (os.path.join(base_dir, group) for group in ("Male", "Female"))

# Load MBC mammograms
def load_mbc_images(folder, target_size=(128, 128)):
    """Load every image file in ``folder`` as a resized grayscale array.

    The label is derived from the file name: names that start with "benign"
    (case-insensitive) get label 0, every other image file gets label 1.
    Files that OpenCV cannot decode are skipped.

    Args:
        folder: Directory that directly contains the image files.
        target_size: ``(width, height)`` handed to ``cv2.resize``.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays in sorted file-name
        order. ``images`` has shape ``(n, height, width)`` and ``labels`` has
        shape ``(n,)``; this also holds when no image could be loaded (n = 0).
    """
    image_exts = ('.jpg', '.jpeg', '.png', '.bmp')
    images, labels = [], []
    # os.listdir returns entries in arbitrary order. Sorting makes the result
    # reproducible, which matters because callers slice the first N images.
    for fname in sorted(os.listdir(folder)):
        lowered = fname.lower()
        if not lowered.endswith(image_exts):
            continue
        img = cv2.imread(os.path.join(folder, fname), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # Unreadable or corrupt file: skip it rather than abort the load.
            continue
        images.append(cv2.resize(img, target_size))
        labels.append(0 if lowered.startswith("benign") else 1)

    if not images:
        # np.array([]) would collapse to shape (0,) and lose the image
        # dimensions; keep the same rank as a non-empty result.
        width, height = target_size
        return (
            np.empty((0, height, width), dtype=np.uint8),
            np.empty((0,), dtype=int),
        )
    return np.array(images), np.array(labels)

X_mbc_orig, y_mbc_orig = load_mbc_images(mbc_dir)
print(f"Loaded {len(X_mbc_orig)} male images")
if len(X_mbc_orig) == 0:
    # Without this guard the augmentation loop below fails with an opaque
    # ZeroDivisionError from the modulo on an empty array length.
    raise ValueError(f"No readable male images found in {mbc_dir}; nothing to augment")

# Augmenting MBC mammograms to 50 images
# The originals are kept first; randomly transformed copies (rotation,
# horizontal flip, brightness change) are appended until there are 50 images.
# If 50 or more originals were loaded, no augmentation happens.
data_gen = ImageDataGenerator(rotation_range=30, horizontal_flip=True, brightness_range=[0.8, 1.2])
X_mbc_augmented = list(X_mbc_orig)
y_mbc_augmented = list(y_mbc_orig)
# Take the image size from the loaded data instead of hard-coding 128x128.
image_shape = X_mbc_orig.shape[1:]
i = 0
while len(X_mbc_augmented) < 50:
    # Cycle through the originals so each is used about equally often.
    src = i % len(X_mbc_orig)
    # flow() expects a batch with a trailing channel axis: (1, H, W, 1).
    img = X_mbc_orig[src].reshape((1, *image_shape, 1))
    label = y_mbc_orig[src]
    aug_img = next(data_gen.flow(img, batch_size=1))[0].reshape(image_shape)
    X_mbc_augmented.append(aug_img)
    # An augmented copy keeps the label of the image it was derived from.
    y_mbc_augmented.append(label)
    i += 1

X_mbc = np.array(X_mbc_augmented)
y_mbc = np.array(y_mbc_augmented)

# Load up to max_per_class (default 50) images from each "Female" subfolder
def load_fbc_subset(folder, max_per_class=50, target_size=(128, 128)):
    """Load a random subset of grayscale images from each subfolder.

    Each immediate subdirectory of ``folder`` is treated as one class. The
    label is 0 when the subfolder name contains "benign" (case-insensitive,
    matching how ``load_mbc_images`` reads file names) and 1 otherwise.

    Args:
        folder: Directory whose subdirectories hold the image files.
        max_per_class: Maximum number of images taken from each subfolder.
        target_size: ``(width, height)`` handed to ``cv2.resize``.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays, grouped by subfolder in
        sorted subfolder order. ``images`` has shape ``(n, height, width)``
        and ``labels`` has shape ``(n,)``; this also holds when n = 0.
    """
    image_exts = ('.jpg', '.jpeg', '.png', '.bmp')
    images, labels = [], []
    for subfolder in sorted(os.listdir(folder)):
        path = os.path.join(folder, subfolder)
        if not os.path.isdir(path):
            continue

        # Shuffle the file names first and decode only as many images as are
        # needed, instead of decoding and resizing the whole class up front.
        # Sorting before the shuffle makes the draw depend only on the
        # random state, not on the arbitrary order of os.listdir.
        candidates = sorted(
            fname for fname in os.listdir(path)
            if fname.lower().endswith(image_exts)
        )
        random.shuffle(candidates)

        selected = []
        for fname in candidates:
            if len(selected) >= max_per_class:
                break
            img = cv2.imread(os.path.join(path, fname), cv2.IMREAD_GRAYSCALE)
            if img is None:
                # Unreadable file: move on to the next candidate so the class
                # can still reach max_per_class if enough readable files exist.
                continue
            selected.append(cv2.resize(img, target_size))

        label = 0 if 'benign' in subfolder.lower() else 1
        images.extend(selected)
        labels.extend([label] * len(selected))

    if not images:
        # Keep the same rank as a non-empty result instead of shape (0,).
        width, height = target_size
        return (
            np.empty((0, height, width), dtype=np.uint8),
            np.empty((0,), dtype=int),
        )
    return np.array(images), np.array(labels)

X_fbc, y_fbc = load_fbc_subset(fbc_dir)
# Report the number of class subfolders actually present on disk instead of
# assuming the dataset always has exactly 8.
num_fbc_subfolders = sum(
    os.path.isdir(os.path.join(fbc_dir, entry)) for entry in os.listdir(fbc_dir)
)
print(f"Loaded {len(X_fbc)} female images across {num_fbc_subfolders} subfolders")

# Experiment configurations, keyed by the label shown in the results table.
# Each value is (number of MBC images, total number of images). Every
# configuration leaves 400 images for the FBC side, so the total is 400 plus
# the MBC count.
_FBC_IMAGES = 400
ratios = {
    name: (mbc_count, _FBC_IMAGES + mbc_count)
    for name, mbc_count in (
        ("FBC Only", 0),
        ("1:41", 10),
        ("1:21", 20),
        ("3:43", 30),
        ("1:11", 40),
        ("1:9", 50),
    )
}

# One summary row per configuration is collected here.
results = []

# Run SVM on each ratio
for label, (num_mbc, total) in ratios.items():
    # Build the dataset for this configuration together with a boolean mask
    # recording which samples are male (MBC) images. The mask is needed
    # because the class label (0 = benign, 1 = malignant) says nothing about
    # whether an image came from the male or the female set.
    if num_mbc == 0:
        X_total = X_fbc
        y_total = y_fbc
        is_mbc_total = np.zeros(len(X_fbc), dtype=bool)
    else:
        num_fbc = total - num_mbc
        X_mbc_subset, y_mbc_subset = X_mbc[:num_mbc], y_mbc[:num_mbc]
        X_fbc_subset, y_fbc_subset = X_fbc[:num_fbc], y_fbc[:num_fbc]
        X_total = np.vstack((X_mbc_subset, X_fbc_subset))
        y_total = np.hstack((y_mbc_subset, y_fbc_subset))
        is_mbc_total = np.hstack((
            np.ones(len(X_mbc_subset), dtype=bool),
            np.zeros(len(X_fbc_subset), dtype=bool),
        ))

    # Shuffle and normalize
    # The same permutation is applied to images, labels and the source mask
    # so the three stay aligned.
    idx = np.random.permutation(len(X_total))
    X_total, y_total, is_mbc_total = X_total[idx], y_total[idx], is_mbc_total[idx]
    X_total = X_total / 255.0
    # Flatten each image into one feature vector for the SVM.
    X_total_flat = X_total.reshape(len(X_total), -1)

    # 70-20-10 split
    # First hold out 10% as the validation set, then take 2/9 of the
    # remaining 90% (20% of the whole) as the test set. The source mask is
    # split alongside the data so the validation samples can be grouped by
    # origin afterwards.
    X_temp, X_val, y_temp, y_val, _, is_mbc_val = train_test_split(
        X_total_flat, y_total, is_mbc_total, test_size=0.1, random_state=42
    )
    # NOTE: X_test / y_test are held out but not used below; all reported
    # metrics come from the validation split.
    X_train, X_test, y_train, y_test = train_test_split(X_temp, y_temp, test_size=2/9, random_state=42)

    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)

    y_pred_val = clf.predict(X_val)

    # Accuracy calculations
    overall_acc = accuracy_score(y_val, y_pred_val)

    # Per-source accuracy: group validation samples by where the image came
    # from (male vs. female), not by class label. A group with no validation
    # samples is reported as "N/A"; with only 10% held out, the MBC group can
    # be empty or very small.
    is_fbc_val = ~is_mbc_val
    mbc_acc = (
        accuracy_score(y_val[is_mbc_val], y_pred_val[is_mbc_val])
        if is_mbc_val.any() else None
    )
    fbc_acc = (
        accuracy_score(y_val[is_fbc_val], y_pred_val[is_fbc_val])
        if is_fbc_val.any() else None
    )

    results.append({
        "Ratio": label,
        "MBC Accuracy": mbc_acc if mbc_acc is not None else "N/A",
        "FBC Accuracy": fbc_acc if fbc_acc is not None else "N/A",
        "Overall Accuracy": overall_acc
    })

# Return the final output as Table (Figure 2)
df = pd.DataFrame(results)
print(df)


In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the recorded output of this cell is a MessageError
# ("credential propagation was unsuccessful"), and none of the code visible
# here reads from /content/drive. Confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')


MessageError: Error: credential propagation was unsuccessful