In [69]:
import sys

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import Perceptron as SkPerceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Single seed reused by the train/test split, the custom perceptron's weight
# initialisation and scikit-learn's Perceptron.
SEED: int = 2132
# True selects StandardScaler for feature scaling; False selects MinMaxScaler.
USE_STANDARD_SCALER: bool = True

In [70]:
# Load the feature matrix and the target vector as plain arrays.
X, y = load_breast_cancer(return_X_y=True)

# Re-encode the target for the perceptron: labels equal to 1 stay +1,
# every other label becomes -1.
y = np.where(y != 1, -1, 1)

# Hold out 30% of the rows for testing; the split is seeded and stratified on y.
split_kwargs = dict(test_size=0.3, random_state=SEED, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [71]:
# Choose the scaler class from the config flag, then instantiate it.
scaler_cls = StandardScaler if USE_STANDARD_SCALER else MinMaxScaler
scaler = scaler_cls()

# Fit on the training rows only, then apply the same transform to both splits.
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

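Scaling is performed *after* splitting: the scaler is fitted on the training rows only and then applied to both splits. If it were fitted on the full dataset, the test rows would influence the fitted statistics (mean and standard deviation for `StandardScaler`, minimum and maximum for `MinMaxScaler`), so the test-set score could look better than the model's performance on truly unseen data.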

This phenomenon is referred to as data leakage.

In [None]:
class Perceptron:
    """Bias-free perceptron classifier for labels encoded as -1 / +1.

    The model is a single weight vector ``w``; a sample ``x`` is assigned to
    +1 when ``w @ x > 0`` and to -1 otherwise. No intercept is learned, so the
    decision boundary always passes through the origin.

    Parameters
    ----------
    iters : int
        Maximum number of passes (epochs) over the training data.
    seed : int or None
        Seed for the random initial weights. ``None`` (the default) falls back
        to the notebook-level ``SEED`` constant.

    Attributes
    ----------
    w : numpy.ndarray or None
        Learned weights, one per feature; ``None`` until ``fit`` is called.
    """

    def __init__(self, iters: int = 1000, seed: "int | None" = None):
        self.iters: int = iters
        self.seed = seed
        self.w = None

    def fit(self, X_train, y_train) -> None:
        """Learn the weights with the classic perceptron update rule.

        Each epoch visits the samples in order; whenever ``y * (w @ x) <= 0``
        the weights are updated with ``w += y * x``. Training stops early as
        soon as a full epoch produces no update.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of n_samples labels in {-1, +1}

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-dimensional or the number of labels does
            not match the number of rows.
        """
        if self.w is not None:
            print("Overwriting previous weights.", file=sys.stderr)

        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train).ravel()
        if X.ndim != 2:
            raise ValueError("X_train must be 2-dimensional (n_samples, n_features).")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X_train and y_train must contain the same number of samples.")

        # SEED is only looked up when no explicit seed was given.
        rng = np.random.default_rng(SEED if self.seed is None else self.seed)
        self.w = rng.standard_normal(X.shape[1])

        for _ in range(self.iters):
            n_updates = 0
            for x, label in zip(X, y):
                # "<=" (not "<"): a sample lying exactly on the boundary is
                # not classified with any margin, so it must trigger an update.
                if label * (self.w @ x) <= 0:
                    self.w = self.w + label * x
                    n_updates += 1
            # A clean epoch means further epochs cannot change the weights.
            if n_updates == 0:
                break

    def predict(self, X_test):
        """Return the predicted label (+1.0 or -1.0) for every row of ``X_test``.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of float, shape (n_samples,)

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.w is None:
            raise RuntimeError("Model has not been trained.")

        scores = np.asarray(X_test, dtype=float) @ self.w
        # A score of exactly 0 is assigned to the negative class.
        return np.where(scores > 0, 1.0, -1.0)

In [73]:
# Train the from-scratch perceptron on the scaled training split and score it
# on the held-out rows.
p = Perceptron()
p.fit(X_train, y_train)
y_pred = p.predict(X_test)
custom_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (Custom):", custom_accuracy)

Accuracy (Custom): 0.9707602339181286


In [74]:
# Baseline: scikit-learn's Perceptron on the same scaled split.
# fit() returns the estimator itself, so construction and training are chained.
clf = SkPerceptron(tol=1e-3, random_state=SEED).fit(X_train, y_train)
y_pred = clf.predict(X_test)
sk_perceptron_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (Scikit Perceptron):", sk_perceptron_accuracy)

Accuracy (Scikit Perceptron): 0.9707602339181286


In [75]:
# Second baseline: k-nearest neighbours with k = 2 on the same scaled split.
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (Scikit KNN):", knn_accuracy)

Accuracy (Scikit KNN): 0.9532163742690059


# Observations

- After applying the standard scaler, the accuracy of all three approaches increased
- Notably, my perceptron's test accuracy (0.9708) exactly matched that of scikit-learn's `Perceptron`
- Using MinMaxScaler worsened the perceptron-classifier accuracy
- The KNN solution's accuracy remained consistent regardless of the scaler chosen