In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score, jaccard_score,
    confusion_matrix, roc_curve, auc
)
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load and prepare data
df = pd.read_csv('samples_cancer.csv')
# 'BareNuc' is coerced to numeric: entries that cannot be parsed become NaN
# and those rows are removed by the dropna() that follows.
df['BareNuc'] = pd.to_numeric(df['BareNuc'], errors='coerce')
df = df.dropna()
X = df.drop(['ID', 'Class'], axis=1)
# Recode the target column: Class 2 -> 0, Class 4 -> 1
y = df['Class'].map({2:0, 4:1})

# Train/test split on the *unscaled* features. Splitting before scaling keeps
# the test rows out of the scaler's mean/variance estimates (no data leakage).
# random_state and stratify are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

# Standardization for SVM performance: fit on the training rows only, then
# apply the same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics; kept so that
# any code still referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

kernels = ['linear', 'poly', 'rbf', 'sigmoid']
# Maps kernel name -> dict of test-set metrics plus the ROC curve points.
svm_results = {}

# Fit one SVC per kernel on the training split and evaluate it on the test split.
for kernel in kernels:
    # probability=True is deliberately not used: it adds an internal
    # cross-validated calibration step to every fit, and the resulting
    # predict_proba output may be inconsistent with predict(). A ROC curve
    # only needs a ranking score, which decision_function provides directly.
    # (random_state is dropped as well: SVC only uses it for the probability
    # estimates.)
    model = SVC(kernel=kernel)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Signed distance to the separating hyperplane; larger values favour class 1.
    y_score = model.decision_function(X_test)

    # Threshold-based metrics computed from the hard predictions.
    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    jaccard = jaccard_score(y_test, y_pred)
    err_rate = 1 - acc
    cm = confusion_matrix(y_test, y_pred)

    # Threshold-free ranking quality computed from the continuous scores.
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    svm_results[kernel] = {
        'acc': acc,
        'rec': rec,
        'prec': prec,
        'f1': f1,
        'jaccard': jaccard,
        'err_rate': err_rate,
        'cm': cm,
        'fpr': fpr,
        'tpr': tpr,
        'roc_auc': roc_auc
    }

    print(f"Kernel = {kernel}")
    print("Accuracy:", f"{acc:.4f}")
    print("Recall:", f"{rec:.4f}")
    print("Precision:", f"{prec:.4f}")
    print("F1-score:", f"{f1:.4f}")
    print("Jaccard score:", f"{jaccard:.4f}")
    print("Error rate:", f"{err_rate:.4f}")
    print("Confusion matrix:\n", cm)
    print()

# Overlay the ROC curves of all four SVM kernels on a single set of axes
fig, ax = plt.subplots(figsize=(8, 6))
for name in kernels:
    scores = svm_results[name]
    ax.plot(
        scores['fpr'],
        scores['tpr'],
        label=f"{name} (AUC = {scores['roc_auc']:.2f})",
    )

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison of SVM Kernels')
ax.legend(loc='lower right')
plt.show()


GaussianNB - Accuracy: 0.7135, F1-Score: 0.5985
BernoulliNB - Accuracy: 0.6510, F1-Score: 0.1299
