In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load dataset from UCI
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
data = pd.read_csv(url, header=None)

# Give column names
columns = ["ID", "Diagnosis"] + [f"attr_{i}" for i in range(30)]
data.columns = columns

# Remove ID as it is not useful
data.drop("ID", axis=1, inplace=True)

# Encode diagnosis as numeric (Benign=0, Malignant=1)
encoder = LabelEncoder()
data["Diagnosis"] = encoder.fit_transform(data["Diagnosis"])

# Separate features and label
X_data = data.drop("Diagnosis", axis=1)
y_data = data["Diagnosis"]

# Scale data (MinMax for chi2, Standard for modeling)
minmax_scaler = MinMaxScaler()
X_minmax_scaled = minmax_scaler.fit_transform(X_data)

std_scaler = StandardScaler()
X_standard_scaled = std_scaler.fit_transform(X_data)

# Create one training-testing split for main part
X_tr, X_te, y_tr, y_te = train_test_split(
    X_standard_scaled, y_data, test_size=0.3, random_state=42, stratify=y_data
)

# Function to select features using given method
def feature_selection(X, y, method_name, k=10):
    if method_name == "chi2":
        selector = SelectKBest(score_func=chi2, k=k)
    elif method_name == "mutual_info":
        selector = SelectKBest(score_func=mutual_info_classif, k=k)
    else:
        raise ValueError("Unknown feature selection method.")
    X_reduced = selector.fit_transform(X, y)
    chosen = selector.get_support(indices=True)
    return X_reduced, chosen

# Function to print evaluation results
def print_results(model_name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mat = confusion_matrix(y_true, y_pred)
    print(f"\nResults for {model_name}:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {pre:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{mat}")

# Helper function to fit and test SVM on reduced features
def test_subset(X_new, y, clf, title):
    X_train, X_test, y_train, y_test = train_test_split(
        X_new, y, test_size=0.3, stratify=y, random_state=42
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print_results(title, y_test, y_pred)

# Get top 10 features using chi2 and mutual info
X_chi, chi_selected = feature_selection(X_minmax_scaled, y_data, "chi2", k=10)
X_mi, mi_selected = feature_selection(X_standard_scaled, y_data, "mutual_info", k=10)

print("Top 10 Chi2 feature indices:", chi_selected)
print("Top 10 Mutual Info feature indices:", mi_selected)

# Train SVMs with RBF kernel
test_subset(X_chi, y_data, SVC(kernel='rbf', random_state=42), "SVM + Chi2")
test_subset(X_mi, y_data, SVC(kernel='rbf', random_state=42), "SVM + Mutual Info")

# Compare different top feature counts
print("Comparing feature sizes (Chi2):")
for n in [5, 10, 15, 20]:
    X_temp, feat_ids = feature_selection(X_minmax_scaled, y_data, "chi2", k=n)
    test_subset(X_temp, y_data, SVC(kernel='rbf', random_state=42), f"SVM + Chi2 (Top {n})")

print("Comparing feature sizes (Mutual Info):")
for n in [5, 10, 15, 20]:
    X_temp, feat_ids = feature_selection(X_standard_scaled, y_data, "mutual_info", k=n)
    test_subset(X_temp, y_data, SVC(kernel='rbf', random_state=42), f"SVM + MI (Top {n})")

# Function to calculate sensitivity, specificity, PPV, NPV
def calc_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    sens = tp / (tp + fn) * 100 if (tp + fn) > 0 else 0
    spec = tn / (tn + fp) * 100 if (tn + fp) > 0 else 0
    ppv = tp / (tp + fp) * 100 if (tp + fp) > 0 else 0
    npv = tn / (tn + fn) * 100 if (tn + fn) > 0 else 0
    return sens, spec, ppv, npv, cm

# Pick top 10 chi2 features for next evaluation
chi_selector = SelectKBest(score_func=chi2, k=10)
X_selected = chi_selector.fit_transform(X_minmax_scaled, y_data)
chosen_feats = chi_selector.get_support(indices=True)
print("Selected top 10 chi2 indices:", chosen_feats)

# Test different dataset partitions
partitions = {
    "50-50": 0.5,
    "70-30": 0.3,
    "80-20": 0.2
}

final_metrics = []
conf_tables = {}

for split_name, test_size in partitions.items():
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y_data, test_size=test_size, random_state=42, stratify=y_data
    )

    classifier = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    sens, spec, ppv, npv, cm = calc_metrics(y_test, y_pred)
    final_metrics.append({
        "Partition": split_name,
        "Sensitivity (%)": round(sens, 2),
        "Specificity (%)": round(spec, 2),
        "Positive Predictive Value (%)": round(ppv, 2),
        "Negative Predictive Value (%)": round(npv, 2)
    })
    conf_tables[split_name] = cm

# Table 6: Model performance
table6 = pd.DataFrame(final_metrics)
print("\nTABLE 6: Model performance")
print(table6)

# Table 7: Confusion matrices
print("\nTABLE 7: Confusion matrices for each split")
for name, matrix in conf_tables.items():
    print(f"\n{name} partition confusion matrix:")
    print(pd.DataFrame(
        matrix,
        index=["Actual Benign", "Actual Malignant"],
        columns=["Pred Benign", "Pred Malignant"]
    ))


Top 10 Chi2 feature indices: [ 0  2  3  6  7 20 22 23 26 27]
Top 10 Mutual Info feature indices: [ 0  2  3  6  7 13 20 22 23 27]

Results for SVM + Chi2:
Accuracy: 0.9532
Precision: 1.0000
Recall: 0.8750
F1 Score: 0.9333
Confusion Matrix:
[[107   0]
 [  8  56]]

Results for SVM + Mutual Info:
Accuracy: 0.9415
Precision: 0.9821
Recall: 0.8594
F1 Score: 0.9167
Confusion Matrix:
[[106   1]
 [  9  55]]
Comparing feature sizes (Chi2):

Results for SVM + Chi2 (Top 5):
Accuracy: 0.9474
Precision: 0.9825
Recall: 0.8750
F1 Score: 0.9256
Confusion Matrix:
[[106   1]
 [  8  56]]

Results for SVM + Chi2 (Top 10):
Accuracy: 0.9532
Precision: 1.0000
Recall: 0.8750
F1 Score: 0.9333
Confusion Matrix:
[[107   0]
 [  8  56]]

Results for SVM + Chi2 (Top 15):
Accuracy: 0.9532
Precision: 1.0000
Recall: 0.8750
F1 Score: 0.9333
Confusion Matrix:
[[107   0]
 [  8  56]]

Results for SVM + Chi2 (Top 20):
Accuracy: 0.9883
Precision: 1.0000
Recall: 0.9688
F1 Score: 0.9841
Confusion Matrix:
[[107   0]
 [  2  62]]