# Polynomial-Kernel SVM from Scratch on Breast Cancer Data
The Breast Cancer Wisconsin dataset has 30 real-valued features describing cell nuclei. A polynomial kernel lets an SVM separate classes using curved decision boundaries without explicitly mapping the data into a high-dimensional feature space. Here we instead carry out that mapping explicitly: each standardised feature vector is expanded into its original (linear) terms plus all degree-2 terms (squares and pairwise products), and a linear SVM is then trained on the expanded features by minimising the L2-regularised hinge loss with per-sample sub-gradient descent (samples are visited in their stored order, without shuffling). Finally, we evaluate with a confusion matrix, precision/recall/F1, and an ROC curve to inspect the trade-off between true positives and false positives.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [None]:
# Load the dataset and pull out the feature matrix and the label vector.
data = load_breast_cancer()
X = data.data.astype(float)
y = data.target  # 0 = malignant, 1 = benign

# Hold out 25% of the samples, stratified on the label, with a fixed seed.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Standardise with statistics taken from the training split only, so the
# test split is scaled with exactly the same per-feature mean and std.
mean = np.mean(X_train_raw, axis=0)
std = np.std(X_train_raw, axis=0)
std = np.where(std == 0, 1.0, std)  # constant features: avoid dividing by zero
X_train_scaled, X_test_scaled = (
    (split - mean) / std for split in (X_train_raw, X_test_raw)
)

print("Training samples:", len(X_train_scaled))
print("Testing samples:", len(X_test_scaled))

In [None]:
def polynomial_features_degree_2(X):
    """Create degree-2 polynomial features (linear + squares + pairwise products).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input samples, one row per sample. Anything ``np.asarray`` accepts
        (nested lists included) is allowed.

    Returns
    -------
    numpy.ndarray
        Array with ``n_samples`` rows and
        ``2 * n_features + n_features * (n_features - 1) // 2`` columns,
        laid out as: the original features, then their squares, then the
        products ``X[:, i] * X[:, j]`` for every ``i < j`` (ordered by ``i``
        first, then ``j``).

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(
            f"X must be 2-D (n_samples, n_features); got {X.ndim}-D input"
        )

    # The strict upper triangle enumerates every pair i < j exactly once in
    # row-major order, i.e. the same column order a nested i/j loop produces.
    left, right = np.triu_indices(X.shape[1], k=1)
    interactions = X[:, left] * X[:, right]

    # With fewer than two features `interactions` has zero columns, so the
    # result is simply the linear and squared terms.
    return np.hstack([X, X ** 2, interactions])

# Apply the same degree-2 expansion to the training and the test split.
X_train_poly, X_test_poly = (
    polynomial_features_degree_2(split)
    for split in (X_train_scaled, X_test_scaled)
)

# Recode the labels for the hinge loss: 1 = benign, -1 = malignant.
y_train, y_test = (
    np.where(labels == 1, 1, -1) for labels in (y_train_raw, y_test_raw)
)

print("Feature dimension after expansion:", X_train_poly.shape[1])

In [None]:
def train_linear_svm(X, y, learning_rate=5e-4, lambda_reg=1e-2, epochs=20):
    """Fit a linear SVM with per-sample sub-gradient steps on the hinge loss.

    Weights start at zero and the bias at 0.0. Each epoch walks through the
    rows of ``X`` paired with the entries of ``y`` in the order given and
    updates the parameters after every sample.

    Parameters
    ----------
    X : 2-D array, one row per sample.
    y : labels paired row-by-row with ``X``; the margin test
        ``y * score >= 1`` assumes +1 / -1 values.
    learning_rate : step size applied to every update.
    lambda_reg : weight of the L2 penalty on ``weights``.
    epochs : number of full passes over the data.

    Returns
    -------
    (weights, bias) : the learned weight vector and scalar bias.
    """
    weights = np.zeros(X.shape[1])
    bias = 0.0

    for _ in range(epochs):
        for sample, label in zip(X, y):
            # Gradient of the L2 penalty; it is part of every step.
            penalty_grad = 2 * lambda_reg * weights

            if label * (np.dot(sample, weights) + bias) >= 1:
                # Margin satisfied: only the regulariser shrinks the weights.
                weights -= learning_rate * penalty_grad
                continue

            # Margin violated: add the hinge term and nudge the bias as well.
            weights -= learning_rate * (penalty_grad - label * sample)
            bias += learning_rate * label
    return weights, bias

def decision_function(X, weights, bias):
    """Return the signed score ``X @ weights + bias`` for each row of ``X``."""
    projections = np.matmul(X, weights)
    return projections + bias

# Fit on the expanded training features using the default hyper-parameters.
weights, bias = train_linear_svm(X=X_train_poly, y=y_train)

# Score the held-out split; scores of zero or more are labelled +1, the rest -1.
scores = decision_function(X=X_test_poly, weights=weights, bias=bias)
pred_labels = np.where(scores >= 0, 1, -1)

In [None]:
# Map the +1 / -1 predictions back to 1 / 0 so they line up with y_test_raw.
pred_binary = (pred_labels == 1).astype(int)

print("Confusion matrix (rows=true, cols=predicted):")
print(confusion_matrix(y_test_raw, pred_binary))

print("\nDetailed report:")
print(
    classification_report(
        y_test_raw,
        pred_binary,
        target_names=["malignant", "benign"],
    )
)

# The ROC curve is built from the continuous scores, not the thresholded labels.
fpr, tpr, _ = roc_curve(y_test_raw, scores)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.3f})")
ax.plot([0, 1], [0, 1], linestyle="--", color="gray")  # diagonal reference line
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve for Polynomial SVM")
ax.legend()
ax.grid(alpha=0.3)
plt.show()