## Luke Waninger
#### 25 May 2018 
#### Homework 7

In [1]:
from H7_source import *
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

# Short aliases for the NumPy helpers used throughout the notebook
# (`np` is expected to be provided by the star import above).
exp   = np.exp
ident = np.identity
na    = np.newaxis
norm  = np.linalg.norm
# Seed NumPy's global RNG so the random draws in later cells are repeatable.
np.random.seed(42)

## Exercise 1

### compute the gradient $\triangledown F(\alpha)$ of $F$


$\triangledown F(\alpha) = 1/n \sum_{i=1}^n \triangledown l(y_i, (K\alpha)_i) + 2\lambda K \alpha$



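$
\triangledown l(y_i, t_i) =
	\begin{cases}
        0 \hspace{55pt}  y_i t_i > 1 + h \\
        -y_i K_i \frac{1+h-y_i t_i}{2h} \hspace{18pt}  |1-y_i t_i| \le h \\
        -y_i K_i \hspace{40pt}  y_i t_i < 1-h
	\end{cases}
\hspace{20pt} \text{where } t_i = (K\alpha)_i \text{ and } K_i \text{ is the } i\text{-th row of } K
$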


### write a function $\texttt{computegram}$, $\texttt{kerneleval}$
I decided to encapsulate these functions into one class each for radial and polynomial kernels for readability and code reuse. The $\texttt{compute}$ function calculates and returns the requested kernel for any set of observations.

In [2]:
class k_radialrbf(Kernel):
    """Gaussian (RBF) kernel: k(x, x') = exp(-||x - x'||^2 / (2 * sigma^2))."""

    def __init__(self, sigma):
        """Store the bandwidth ``sigma`` used by ``compute``."""
        super().__init__()
        self.sigma = sigma

    def __str__(self):
        return f'rbf({self.sigma})'

    def compute(self, x, xp=None):
        """Return the kernel matrix between the rows of ``x`` and the rows of ``xp``.

        Parameters
        ----------
        x : 2-D array, one observation per row.
        xp : 2-D array with the same number of columns as ``x``, or ``None``
            to evaluate ``x`` against itself.

        Returns
        -------
        Array of shape ``(len(x), len(xp))`` with entries in ``(0, 1]``.
        """
        sigma = self.sigma
        xp = x if xp is None else xp

        # Squared distances via ||a||^2 + ||b||^2 - 2 a.b, which avoids
        # materialising every pairwise difference.
        sq_x = np.linalg.norm(x, axis=1)**2
        sq_xp = np.linalg.norm(xp, axis=1)**2
        sq_dist = sq_x[:, na] + sq_xp[na, :] - 2*(x @ xp.T)

        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical rows, which would give kernel values above 1.
        sq_dist = np.maximum(sq_dist, 0.)

        return exp(-1/(2*sigma**2) * sq_dist)


class k_polynomial(Kernel):
    """Polynomial kernel: k(x, x') = (<x, x'> + b) ** degree."""

    def __init__(self, degree, b=1.):
        """Store the polynomial ``degree`` and the additive offset ``b``."""
        super().__init__()
        self.degree = degree
        self.b = b

    def __str__(self):
        # Only the degree is reported; the offset b is not part of the label.
        return f'polynomial({self.degree})'

    def compute(self, x, xp=None):
        """Return the kernel matrix between the rows of ``x`` and ``xp``.

        When ``xp`` is ``None`` the kernel is evaluated between ``x`` and itself.
        """
        if xp is None:
            xp = x

        inner = x @ xp.T
        return (inner + self.b)**self.degree

### consider the Digits dataset, download and standardize

In [3]:
# Load all 10 digit classes as a feature matrix `x` and a label vector `y`.
x, y = load_digits(n_class=10, return_X_y=True)

# Hold out 33% of the observations; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# Fit the scaler on the training split only, then apply the same transform to both splits.
scalar  = StandardScaler().fit(x_train)
x_train = scalar.transform(x_train)
x_test  = scalar.transform(x_test)

### write a function $\texttt{mysvm}$
Again, I chose to implement this using OOP so I could encapsulate the SVM, and other helper functions in a single object.

In [4]:
class MySVM(Estimator):
    """Binary SVM on a precomputed kernel matrix, using the huberized hinge loss.

    Every method takes the kernel matrix ``k`` directly; ``beta`` holds one
    coefficient per column of ``k`` and ``y`` is indexed per row of ``k``.
    """

    def __init__(self, kernel):
        """Remember the kernel object; it is only read by ``__str__`` here."""
        self.kernel = kernel

    def __str__(self):
        return f'SVM(kernel={self.kernel})'

    @staticmethod
    def gradient(k, y, beta, l, h=0.5):
        """Gradient of ``objective`` with respect to ``beta``.

        ``l`` is the regularisation weight and ``h`` the half-width of the
        quadratic (smoothed) zone of the loss around margin 1.
        """
        n, d = k.shape
        margins = y * (k @ beta)
        gap = np.abs(1 - margins)

        in_quadratic = gap <= h       # 1-h <= margin <= 1+h
        in_linear = margins < 1 - h   # margin violated by more than h

        # Per-observation gradient of the unsmoothed hinge part: -y_i * k_i.
        neg_yk = -y[:, na] * k

        per_row = np.zeros([n, d])    # rows with margin > 1+h stay zero
        per_row[in_quadratic] = ((1/(2*h)) * ((1 + h - margins)[:, na]) * neg_yk)[in_quadratic]
        per_row[in_linear] = neg_yk[in_linear]

        # NOTE(review): the penalty term matches l*||beta||^2 in `objective`,
        # not the alpha' K alpha form written in the notebook text.
        return np.array(np.sum(per_row, axis=0)/n + 2*l*beta)

    def fgrad(self, k, y, l, eta=1., max_iter=100):
        """Gradient descent with an extrapolation (momentum) step.

        Starts from zero coefficients and runs until ``max_iter`` iterations
        have been done or the step size returned by ``backtracking`` is
        numerically zero. Returns the last non-extrapolated iterate.
        """
        _, d = k.shape
        prev = np.zeros(d)
        theta = np.copy(prev)
        grad = self.gradient(k, y, prev, l)

        it = 0
        while it < max_iter:
            if np.isclose(0, eta):
                break

            # NOTE(review): the step size is searched from the previous
            # iterate while the step itself is taken from `theta`.
            eta = backtracking(k, y, prev, l, eta, self.gradient, self.objective)

            nxt = theta - eta*grad
            theta = nxt + (it/(it+3))*(nxt - prev)
            grad = self.gradient(k, y, theta, l)
            prev = nxt

            it += 1

        return prev

    @staticmethod
    def objective(k, y, l, beta, h=0.5):
        """Mean huberized hinge loss plus ``l * ||beta||^2``.

        Note the argument order (``l`` before ``beta``) differs from ``gradient``.
        """
        n, _ = k.shape
        margins = y * (k @ beta)
        gap = np.abs(1 - margins)

        in_quadratic = gap <= h
        in_linear = margins < 1 - h

        per_obs = np.zeros(n)         # zero loss where margin > 1+h
        per_obs[in_quadratic] = ((1 + h - margins)**2 / (4*h))[in_quadratic]
        per_obs[in_linear] = (1 - margins)[in_linear]

        return np.sum(per_obs)/n + l*norm(beta)**2

    def predict(self, kp, beta):
        """Return a list of +1/-1 labels, one per row of ``kp`` (sign of the score)."""
        labels = []
        for row in kp:
            labels.append(1 if row @ beta.T > 0 else -1)
        return labels

    def predict_proba(self, kp, beta):
        """Return the raw decision score for each row of ``kp`` as a list.

        Despite the name, the values are unnormalised scores, not probabilities.
        """
        scores = []
        for row in kp:
            scores.append(row @ beta.T)
        return scores

### train your SVM with the huberized hinge loss and order 7 polynomial kernel

running one vs rest with a polynomial kernel of degree 7 with $\lambda$=1 gives a horrible validation error: $\approx$ 0.518

In [5]:
# Degree-7 polynomial kernel with the default offset b=1.
kernl = k_polynomial(7)
# One-vs-rest fit on the digits split; the trailing 1. is presumably the
# regularisation weight lambda quoted in the text -- confirm against OVR.fit.
OVR(MySVM(kernl), n_jobs=-1).fit(x_train, y_train, x_test, y_test, 1.)

fitting <OVR(estimator=SVM(kernel=polynomial(7)) err=0.0)>: 11it [00:42,  2.77s/it]                        

<OVR(estimator=SVM(kernel=polynomial(7)) err=0.5183501683501682)>

Using cross-validation, the error drops by roughly 9 percentage points (from $\approx$ 0.518 to $\approx$ 0.430).

In [6]:
# Reuses `kernl` (the degree-7 polynomial kernel) defined in the previous cell.
ovr = OVR(MySVM(kernl), n_jobs=-1)
# 3-fold CV over five evenly spaced values in [0.001, 1]; presumably the
# candidate lambda values -- confirm against cv().
cv(x_train, y_train, ovr, eargs=np.linspace(.001, 1., 5), nfolds=3)

3-Fold CV: <OVR(estimator=SVM(kernel=polynomial(7)) err=0.0)>: 100%|██████████| 50/50 [01:46<00:00,  1.63s/it]

<OVR(estimator=SVM(kernel=polynomial(7)) err=0.42966506201657173)>

### compare the performance of kernel SVMs
It quickly becomes clear the 7-degree polynomial kernel is a horrible choice. Below, I run a series of OVR (one-vs-rest) polynomial and radial kernels that all show much better performance. 

In [7]:
# Kernels to compare: polynomial degrees 1, 3, 5 and RBF bandwidths 1, 5, 10.
kernels = [
    k_polynomial(1),
    k_polynomial(3),
    k_polynomial(5),
    k_radialrbf(1),
    k_radialrbf(5),
    k_radialrbf(10)
]

# Same one-vs-rest wrapper and the same 5-point grid for every kernel.
# Note this loop rebinds the global `kernl` used by the earlier cells.
for kernl in kernels:
    ovr, eargs = OVR(MySVM(kernl), n_jobs=-1), np.linspace(.001, 1., 5)
    cv(x_train, y_train, ovr, eargs, nfolds=3)

3-Fold CV: <OVR(estimator=SVM(kernel=polynomial(1)) err=0.0)>: 51it [02:02,  1.08s/it]                        

<OVR(estimator=SVM(kernel=polynomial(1)) err=0.018785578418442268)>

3-Fold CV: <OVR(estimator=SVM(kernel=polynomial(3)) err=0.0)>: 100%|██████████| 50/50 [01:51<00:00,  1.38s/it]

<OVR(estimator=SVM(kernel=polynomial(3)) err=0.018785578418442268)>

3-Fold CV: <OVR(estimator=SVM(kernel=polynomial(5)) err=0.0)>: 51it [01:45,  1.56s/it]                        

<OVR(estimator=SVM(kernel=polynomial(5)) err=0.018785578418442268)>

3-Fold CV: <OVR(estimator=SVM(kernel=rbf(1)) err=0.0)>: 51it [01:19,  1.41it/s]                        

<OVR(estimator=SVM(kernel=rbf(1)) err=0.004869540622716667)>

3-Fold CV: <OVR(estimator=SVM(kernel=rbf(5)) err=0.0)>: 51it [01:45,  1.74s/it]                        

<OVR(estimator=SVM(kernel=rbf(5)) err=0.004703471060095379)>

3-Fold CV: <OVR(estimator=SVM(kernel=rbf(10)) err=0.0)>: 100%|██████████| 50/50 [01:50<00:00,  1.48s/it]

<OVR(estimator=SVM(kernel=rbf(10)) err=0.004703471060095379)>

## Exercise 2

In [8]:
def oja(x, t=1., max_iter=50):
    """Project ``x`` onto two directions found by normalised iterative updates.

    Each direction starts from a random unit vector drawn from NumPy's global
    RNG and is repeatedly updated as ``w <- normalise(w + t * A @ w)``. For the
    first direction ``A = x.T @ x``; for the second, ``A`` is deflated by the
    first direction, ``A @ (I - w1 w1.T)``.

    Parameters
    ----------
    x : 2-D array of shape ``(n, d)``. No mean-centring is applied here, so
        centre ``x`` beforehand if centred components are wanted.
    t : step size; it is held constant over all iterations.
    max_iter : number of update steps performed for each direction.

    Returns
    -------
    Array of shape ``(n, 2)``: the rows of ``x`` projected onto the two
    directions.
    """
    _, d = x.shape

    # Loop-invariant d x d matrix; computing it once avoids redoing the
    # O(n * d^2) product on every iteration.
    gram = x.T @ x

    w1 = np.random.normal(size=d)
    w1 /= np.linalg.norm(w1)
    for _ in range(max_iter):
        w1 = w1 + t*(gram @ w1)
        w1 = w1/np.linalg.norm(w1)

    w1 = w1[:, np.newaxis]
    # Deflate: remove the component along w1 before applying gram.
    deflated = gram @ (np.identity(d) - w1 @ w1.T)

    w2 = np.random.normal(size=d)
    w2 = w2/np.linalg.norm(w2)
    for _ in range(max_iter):
        w2 = w2 + t*(deflated @ w2)
        w2 /= np.linalg.norm(w2)

    w2 = w2[:, np.newaxis]
    return np.concatenate([x@w1, x@w2], axis=1)

### generate a simulated dataset

In [9]:
def gen_k(n, m, scale, sd=25.):
    """Simulate ``n`` noisy observations around one random ``m``-dimensional mean.

    A mean vector is drawn uniformly from ``[n*scale, n*scale + m)`` and every
    observation is that vector plus independent Gaussian noise. All draws use
    NumPy's global RNG.

    Parameters
    ----------
    n : number of observations (rows).
    m : number of features (columns).
    scale : multiplied by ``n`` to give the lower bound of the mean range.
    sd : standard deviation of the Gaussian noise (default 25).

    Returns
    -------
    Array of shape ``(n, m)``.
    """
    start = n*scale
    means = np.random.uniform(start, start+m, m)
    # Shuffling i.i.d. uniform draws does not change their distribution; the
    # call is kept so the global RNG stream advances in the same way.
    np.random.shuffle(means)

    # One vectorised draw; `means` is broadcast across the n rows and the
    # array is filled row by row.
    return np.random.normal(means, sd, size=(n, m))

### run my oja and plot vs scikit's

In [21]:
# Three simulated classes, 30 observations each, 60 features.
n_obs, n_feat, n_klass = 30, 60, 3
# One gen_k block per class (the class index is passed as `scale`), stacked row-wise.
# NOTE: this rebinds the global `x` that earlier held the digits features.
x = np.array([gen_k(n_obs, n_feat, i) for i in range(n_klass)]).reshape((n_obs*n_klass, n_feat))
print(x.shape)
# Two-column projection from the hand-written oja() next to the PCA transform for comparison.
mykit = oja(x)
scikt = PCA().fit_transform(X=x)

pca_plot([mykit, scikt], n_obs, n_klass)

(90, 60)


As you can see, the classes are clearly separable with both mine and scikit's displaying comparable performance. 

## data competition project
The following code snippets use the code base I submitted with last week's homework. I zipped it up here in case you wanted to take a look at it. My ridiculous overkill ml module became much too bloated for the purposes of this assignment. I had created a nice parallel base with a producer-consumer style distributed architecture, which worked well but incurred too much overhead and turned out to be an enormous waste of time. I pulled out the necessary code for this assignment.

In [11]:
def ex2_data(classes=None):
    """Load the competition arrays from ``./data/h6`` and standardise the features.

    Parameters
    ----------
    classes : optional collection of labels. When given, only training and
        validation rows whose label is in ``classes`` are kept; the test
        features are never filtered.

    Returns
    -------
    ``(x_train, y_train, x_val, y_val, x_test)`` with all three feature
    arrays transformed by a scaler fitted on the (filtered) training features.
    """
    paths = (
        './data/h6/train_features.npy',
        './data/h6/train_labels.npy',
        './data/h6/val_features.npy',
        './data/h6/val_labels.npy',
        './data/h6/test_features.npy',
    )
    x_train, y_train, x_val, y_val, x_test = (np.load(p) for p in paths)

    if classes is not None:
        keep_train = np.isin(y_train, classes)
        x_train, y_train = x_train[keep_train, :], y_train[keep_train]

        keep_val = np.isin(y_val, classes)
        x_val, y_val = x_val[keep_val, :], y_val[keep_val]

    # Fit on training data only so validation/test statistics do not leak in.
    scaler = StandardScaler().fit(x_train)
    x_train, x_test, x_val = (scaler.transform(part) for part in (x_train, x_test, x_val))

    return x_train, y_train, x_val, y_val, x_test

In [None]:
def ex2a_ap():
    """Fit a one-vs-one ``LinearSVC`` on classes 2, 3 and 4 and log its validation error.

    The misclassification rate on the validation split is appended to
    ``ex2a.txt`` under the label ``ap`` (all pairs).
    """
    from sklearn.svm import LinearSVC
    from sklearn.multiclass import OneVsOneClassifier

    x_train, y_train, x_val, y_val, x_test = ex2_data([2, 3, 4])

    # Named `clf` rather than `cv` so the notebook's cv() helper is not shadowed.
    clf = OneVsOneClassifier(LinearSVC()).fit(x_train, y_train)
    predictions = clf.predict(x_val)

    error = np.mean(predictions != y_val)
    with open('ex2a.txt', 'a+') as f:
        # Labelled 'ap' so this line can be told apart from the one-vs-rest
        # result that ex2a_ovr appends to the same file.
        f.write(f'ap: {str(error)}\n')

In [None]:
def ex2a_ovr():
    """Fit a one-vs-rest ``LinearSVC`` on all classes and log its validation error.

    The misclassification rate on the validation split is appended to
    ``ex2a.txt`` under the label ``ovr``.
    """
    from sklearn.svm import LinearSVC
    from sklearn.multiclass import OneVsRestClassifier

    x_train, y_train, x_val, y_val, x_test = ex2_data()

    base = LinearSVC()
    model = OneVsRestClassifier(base).fit(x_train, y_train)
    y_hat = model.predict(x_val)

    # Fraction of validation labels predicted incorrectly.
    error = np.mean(y_hat != y_val)

    with open('ex2a.txt', 'a+') as f:
        f.write(f'ovr: {str(error)}\n')

In [None]:
def ex2b_ap():
    """Train the hand-written linear SVM in all-pairs mode and log its validation error.

    Uses every class returned by ``ex2_data()``. After fitting, test-set
    predictions are emitted through ``output_predictions`` and the validation
    misclassification rate is appended to ``ex2.txt`` under the task name.
    """
    x_train, y_train, x_val, y_val, x_test = ex2_data()

    ex2b_ap_params = {
        'classifiers': [
            {
                'type': 'linear_svm',
                'parameters': {
                    'loss': ['smoothed_hinge'],
                    'h': [0.5],
                    'algo': ['fgrad'],
                    'alpha': [0.5],
                    'bt_max_iter': [50],
                    'eps': [.001],
                    'eta': [1.],
                    # Wrapped in a list like every other entry here and like
                    # the matching entry in ex2b_ovr.
                    # NOTE(review): assumes MultiClassifier expects a list of
                    # candidate values per parameter -- confirm.
                    'lambda': [64.],
                    'max_iter': [100],
                    't_eta': [0.8],
                }
            }
        ]
    }

    task = 'ex2b_ap'
    clf = MultiClassifier(x_train=x_train, y_train=y_train, parameters=ex2b_ap_params,
                          x_val=x_val, y_val=y_val, n_jobs=2,
                          classification_method='all_pairs', task=task,
                          log_path='.', logging_level='none').fit()
    clf.output_predictions(x_test)

    predictions = clf.predict(x_val)
    # Built-in sum(): passing a generator to np.sum is deprecated.
    n_correct = sum(1 for yh, yt in zip(predictions, y_val) if yh == yt)
    error = 1 - n_correct/len(predictions)
    with open('ex2.txt', 'a+') as f:
        f.write(f'{task}: {str(error)}\n')

In [None]:
def ex2b_ovr():
    """Train the hand-written linear SVM in one-vs-rest mode and log its validation error.

    Uses only classes 1, 2 and 3. After fitting, test-set predictions are
    emitted through ``output_predictions`` and the validation
    misclassification rate is appended to ``ex2.txt`` under the task name.
    """
    x_train, y_train, x_val, y_val, x_test = ex2_data([1, 2, 3])

    ex2b_ovr_params = {
        'classifiers': [
            {
                'type': 'linear_svm',
                'parameters': {
                    'loss': ['smoothed_hinge'],
                    'h': [0.5],
                    'algo': ['fgrad'],
                    'alpha': [0.5],
                    'bt_max_iter': [50],
                    'eps': [.001],
                    'eta': [1.],
                    'lambda': [64.],
                    'max_iter': [100],
                    't_eta': [0.8],
                }
            }
        ]
    }

    task = 'ex2b_ovr'
    clf = MultiClassifier(x_train=x_train, y_train=y_train, parameters=ex2b_ovr_params,
                          x_val=x_val, y_val=y_val, n_jobs=2,
                          classification_method='ovr', task=task,
                          log_path='.', logging_level='none').fit()
    clf.output_predictions(x_test)

    predictions = clf.predict(x_val)
    # Built-in sum(): passing a generator to np.sum is deprecated.
    n_correct = sum(1 for yh, yt in zip(predictions, y_val) if yh == yt)
    error = 1 - n_correct / len(predictions)
    with open('ex2.txt', 'a+') as f:
        f.write(f'{task}: {str(error)}\n')