In [1]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import numpy as np
import matplotlib.pyplot as plt

from cv_svm import SVM_smooth

In [2]:
# Load the breast-cancer feature matrix and target vector as plain arrays.
X, y = load_breast_cancer(return_X_y=True)
# Recode label 0 as -1, in place; every other label is left untouched.
y[y == 0] = -1

In [3]:
# Standardize the features with a freshly fitted StandardScaler, then report
# the shape of the transformed matrix.
scaler = StandardScaler()
X = scaler.fit_transform(X)
print(X.shape)

(569, 30)


In [4]:
# Reference model: scikit-learn's SVC with a linear kernel, fitted on (X, y).
clf = SVC(kernel='linear').fit(X, y)

# Keep a unit-norm copy of the fitted coefficients; later cells compare other
# weight vectors against it.
sk_weights = clf.coef_
sk_coef = sk_weights / np.linalg.norm(sk_weights)

# Show the raw (un-normalized) coefficients and the intercept, one per line.
print(clf.coef_, clf.intercept_, sep="\n")

[[-0.32157975 -0.0963414  -0.29618472 -0.27044914  0.0145657   0.61925944
  -0.75749079 -0.90968508 -0.07847361  0.34800574 -0.84095245  0.30550603
  -0.23541953 -0.89135374 -0.35509455  0.39115374  0.37739417 -0.46041889
   0.10093298  0.88583541 -0.59013908 -0.97190388 -0.33349947 -0.71227359
  -0.42708268  0.1721726  -1.03818882 -0.09378355 -0.44679874 -0.85526809]]
[0.04438971]


In [5]:
# Accuracy of `clf` on X, scored against the labels y.
y_pred = clf.predict(X)
accuracy_score(y_true=y, y_pred=y_pred)

0.9876977152899824

In [None]:
# Fit the project's SVM_smooth model on the same data. approx_cv=True is
# forwarded to fit (presumably switches on the approximate cross-validation
# run that later cells read from clf.loo_iacv_ — confirm in cv_svm).
clf = SVM_smooth(sigma=2e-35, lbd=1e-3)
clf.fit(X, y, thresh=5e-4, n_iter=500, eta=0.4, approx_cv=True)

# Norm of whatever clf.nabla_fgd_ returns at the fitted weights
# (presumably the objective's gradient — confirm in cv_svm).
grad_at_fit = clf.nabla_fgd_(X, y, clf.weights_, clf.sigma_, clf.lbd_)
print(f"grad {np.linalg.norm(grad_at_fit)}")

# Compare the unit-norm fitted weights with the unit-norm sklearn coefficients.
coef = clf.weights_ / np.linalg.norm(clf.weights_)
mean_abs_gap = np.mean(np.abs(coef - sk_coef))
print(f"diff {mean_abs_gap} | pct {mean_abs_gap / np.sum(np.abs(coef))}")

# Accuracy of the fitted model on X, scored against y.
y_pred = clf.predict(X)
accuracy_score(y, y_pred)

IACV: 0.9869231481559192 | baseline: 0.003470265995856574
IACV: 0.9875296485630265 | baseline: 0.004264783595645942
IACV: 0.9874006924834106 | baseline: 0.004911039442670612
IACV: 0.9873050239314632 | baseline: 0.005667275199972825
IACV: 0.987242606550302 | baseline: 0.006493290641890296
IACV: 0.987213400439479 | baseline: 0.00736575811233502
IACV: 0.9872173621696753 | baseline: 0.008270521845817537
IACV: 0.9872544447994075 | baseline: 0.009198584732675357
IACV: 0.9873245978937253 | baseline: 0.010143984728577162
IACV: 0.9874277675448734 | baseline: 0.011102625358273024
IACV: 0.987563896394889 | baseline: 0.01207160303290357
IACV: 0.9877329236601022 | baseline: 0.013048803896457467
IACV: 0.9879347851575093 | baseline: 0.014032653059305145
IACV: 0.9881694133329749 | baseline: 0.015021953371600148
IACV: 0.9884367372912346 | baseline: 0.01601577867999642
IACV: 0.9887366828276454 | baseline: 0.017013401293889185
IACV: 0.9890691724616505 | baseline: 0.01801424154431047
IACV: 0.9894322320964

In [None]:
# Display the loo_iacv_ attribute of the fitted model.
# NOTE(review): presumably the leave-one-out results from the approximate
# (IACV) procedure — confirm against cv_svm.
clf.loo_iacv_

In [None]:
# Display the loo_true_ attribute of the fitted model.
# NOTE(review): presumably the exact leave-one-out results used as the
# comparison baseline — confirm against cv_svm.
clf.loo_true_

In [None]:
# Average clf.loo_iacv_ along its first axis, rescale the result to unit norm,
# and report how far it sits from the unit-norm sklearn coefficients.
iacv_mean = np.mean(clf.loo_iacv_, axis=0)
coef = iacv_mean / np.linalg.norm(iacv_mean)
mean_abs_gap = np.mean(np.abs(coef - sk_coef))
print(f"diff {mean_abs_gap} | pct {mean_abs_gap / np.sum(np.abs(coef))}")

In [None]:
# Average clf.loo_true_ along its first axis, rescale the result to unit norm,
# and report how far it sits from the unit-norm sklearn coefficients.
true_cv_mean = np.mean(clf.loo_true_, axis=0)
coef = true_cv_mean / np.linalg.norm(true_cv_mean)
mean_abs_gap = np.mean(np.abs(coef - sk_coef))
print(f"diff {mean_abs_gap} | pct {mean_abs_gap / np.sum(np.abs(coef))}")

In [None]:
# Overlay histograms of the unit-norm weights of `clf` ("FGD") and of the
# unit-norm sklearn coefficients, using the same number of bins for both.
coef = clf.weights_ / np.linalg.norm(clf.weights_)
nbin = 100
for values, series_label in ((coef, "FGD"), (sk_coef.flatten(), "sklearn")):
    plt.hist(values, bins=nbin, label=series_label, alpha=0.3)
plt.legend()
plt.show()

In [None]:
# Sweep over several sigma values: refit SVM_smooth for each one and print its
# accuracy on X, the norm of clf.nabla_fgd_ at the fitted weights, and the mean
# absolute gap to the unit-norm sklearn coefficients.
sigmas = [2e-30, 2e-25, 2e-15, 2e-1]
for s in sigmas:
    # The step size passed to fit is 0.6 * s, so a zero sigma is skipped.
    if s == 0:
        continue

    clf = SVM_smooth(sigma=s, lbd=1)
    clf.fit(X, y, thresh=5e-3, n_iter=2000, eta=0.6 * s, cv=False, approx_cv=False)

    y_pred = clf.predict(X)
    score = accuracy_score(y, y_pred)

    coef = clf.weights_ / np.linalg.norm(clf.weights_)
    grad_norm = np.linalg.norm(clf.nabla_fgd_(X, y, clf.weights_, clf.sigma_, clf.lbd_))
    sk_gap = np.mean(np.abs(coef - sk_coef))

    print(f"sigma {s} | score {score} | grad {grad_norm} | sklearn diff {sk_gap}")