## Luke Waninger
#### 25 May 2018 
#### Homework 7

In [7]:
from H7_source import *
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'


exp = np.exp
na = np.newaxis
norm = np.linalg.norm
ident = np.identity
np.random.seed(42)

## Exercise 1

### compute the gradient $\triangledown F(\alpha)$ of $F$


In [10]:
def gradient(k, y, beta, l, h=0.5):
    n, d = k.shape
    lg = np.zeros([n, d])

    yk = y *(k @ beta)
    mask = np.abs(1 - yk)

    lg[mask <= h] = ((1/(2*h)) * ((1 + h-yk)[:, na]) * (-y[:, na] * k))[mask <= h]
    lg[yk < 1-h]  = (-y[:, na] * k)[yk < 1-h]

    return np.array(np.sum(lg, axis=0)/n + 2*l*beta)

### write a function $\texttt{computegram}$, $\texttt{kerneleval}$
I decided to encapsulate these functions into one class each for radial and polynomial kernels for readability and code reuse.

In [11]:
class k_radialrbf(Kernel):
    def __init__(self, sigma):
        super().__init__()
        self.sigma = sigma

    def __str__(self):
        return f'rbf({self.sigma})'

    def compute(self, x, xp=None):
        sigma = self.sigma
        xp = x if xp is None else xp

        def norm(mat):
            return np.linalg.norm(mat, axis=1)

        return exp(-1 / (2 * sigma ** 2) * ((norm(x) ** 2)[:, na] + (norm(xp) ** 2)[na, :] - 2 * (x @ xp.T)))


class k_polynomial(Kernel):
    def __init__(self, degree, b=1.):
        super().__init__()
        self.degree = degree
        self.b = b

    def __str__(self):
        return f'polynomial({self.degree})'

    def compute(self, x, xp=None):
        xp = x if xp is None else xp

        return (x @ xp.T + self.b) ** self.degree


### consider the Digits dataset, download and standardize

In [12]:
x, y = load_digits(n_class=10, return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

scalar  = StandardScaler().fit(x_train)
x_train = scalar.transform(x_train)
x_test  = scalar.transform(x_test)

### write a function $\texttt{mysvm}$

In [13]:
class MySVM(Estimator):
    def __init__(self, kernel):
        self.kernel = kernel
        self.gradient = gradient
    
    def __str__(self):
        return f'SVM(kernel={self.kernel})'
    
    def fgrad(self, k, y, l, eta=1., max_iter=100):
        n, d  = k.shape
        b0    = np.zeros(d)
        theta = np.copy(b0)
        grad  = self.gradient(k, y, b0, l)

        i = 0
        while i < max_iter and not np.isclose(0, eta):
            eta = backtracking(k, y, n, b0, l, eta, self.gradient, self.objective)

            b1 = theta - eta*grad
            theta = b1 + (i/(i+3))*(b1-b0)
            grad  = self.gradient(k, y, theta, l)
            b0 = b1

            i += 1
        
        return b0
    
    @staticmethod
    def objective(k, y, l, beta, h=0.5):
        n, d = k.shape
        loss = np.zeros(n)
        yk = y * (k @ beta)
        mask = np.abs(1 - yk)

        loss[mask <= h] = ((1 + h-yk)**2 / (4*h))[mask <= h]
        loss[yk < 1-h] = (1 - yk)[yk < 1-h]

        return np.sum(loss)/n + l*norm(beta)**2

    def predict(self, kp, beta):
        return [1 if ki @ beta.T > 0 else -1 for ki in kp]
    
    def predict_proba(self, kp, beta):
        return [ki @ beta.T for ki in kp]

In [14]:
k_polynomial(7).compute(x_train)

array([[ 1.26265062e+12,  7.47363597e+05, -1.93224148e+08, ...,
        -1.38232013e+03,  4.39391201e+06, -4.66542513e-05],
       [ 7.47363597e+05,  1.48105710e+11,  1.92712053e+06, ...,
         7.61470741e+07,  6.32965585e+04,  2.78854033e+10],
       [-1.93224148e+08,  1.92712053e+06,  2.51973127e+11, ...,
         1.44519975e+08,  9.44285484e+01,  9.80161028e+06],
       ...,
       [-1.38232013e+03,  7.61470741e+07,  1.44519975e+08, ...,
         1.76237741e+11,  3.84046216e+04,  3.65261096e-01],
       [ 4.39391201e+06,  6.32965585e+04,  9.44285484e+01, ...,
         3.84046216e+04,  3.46155188e+10, -5.44572989e-01],
       [-4.66542513e-05,  2.78854033e+10,  9.80161028e+06, ...,
         3.65261096e-01, -5.44572989e-01,  1.27256605e+11]])

In [15]:
kernl = k_polynomial(7)
timeit(lambda: MySVM(kernl).fgrad(kernl.compute(x_train), y_train, 1.))

0:00:08.831393


array([2.68520820e+09, 5.61695623e+08, 2.27322493e+09, ...,
       1.55505242e+08, 2.63093661e+08, 2.58941755e+08])

running one vs rest with a polynomial kernel of degree 7 gives a horrible validation error: $\approx$ 0.518

In [18]:
#OVR(MySVM(kernl), n_jobs=-1).fit(x_train, y_train, x_test, y_test, 1.)

using cross-validation we see a slight improvement

In [19]:
def cv(x, y, x_test, y_test, estimator, eargs, nfolds=3):
    pbar = track_bar(track_total(estimator, eargs), desc=f'{nfolds}-Fold CV: {estimator}, reg: {eargs}')
    step = int(x.shape[0]/nfolds)
    
    for arg in eargs:
        tidx = np.random.choice(np.arange(len(y)), 2*step)
        vidx = list(set(np.arange(len(y))) - set(tidx))
        xa, ya, xva, yva = x[tidx, :], y[tidx], x[vidx, :], y[vidx]
        estimator = estimator.fit(xa, ya, xva, yva, arg, pbar)
        
    [pbar.put(f) for f in [1, 'END_FLAG']]
    return estimator

In [21]:
ovr = OVR(MySVM(kernl), n_jobs=-1)
cv(x_train, y_train, x_test, y_test, ovr, np.linspace(.001, 1., 5))
print(ovr.kl_args)

3-Fold CV: <OVR(estimator=SVM(kernel=polynomial(7)) err=0.0)>, reg: [0.001   0.25075 0.5005  0.75025 1.     ]: 100%|██████████| 50/50 [01:05<00:00,  1.39it/s]

<OVR(estimator=SVM(kernel=polynomial(7)) err=0.42966506201657173)>

0VR lambda=0.001
1VR lambda=0.5005
2VR lambda=0.001
3VR lambda=0.001
4VR lambda=1.0
5VR lambda=0.001
6VR lambda=0.001
7VR lambda=0.001
8VR lambda=0.001
9VR lambda=0.001


It quickly becomes clear the 7-degree polynomial kernel is a horrible choice. Below, I run a series of OVO polynomial and radial kernels that all show much better performance. 

In [None]:
kernels = [
    k_polynomial(1),
    k_polynomial(3),
    k_polynomial(5),
    k_radialrbf(1),
    k_radialrbf(5),
    k_radialrbf(10)
]

for kernl in kernels:
    ovr, eargs = OVR(MySVM(kernl), n_jobs=-1), np.linspace(.001, 1., 5)
    cv(x_train, y_train, x_test, y_test, ovr, eargs)