In [1]:
# Useful starting lines
%matplotlib inline

import random
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Support Vector Machines
## Classification Using SVM
Load dataset. We will re-use the CERN dataset from project 1, available from https://inclass.kaggle.com/c/epfml-project-1/data

In [44]:
from helpers import load_csv_data

DATA_TRAIN_PATH = 'data/train.csv'

y, X, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=True)
print(y.shape, X.shape)

(5000,) (5000, 30)


## Prepare cost and prediction functions

The primal objective for SVM is:

$$\mathcal L (\mathbf w) = \sum_{n=1}^N [1- y_n \mathbf x_n^\top \mathbf w]_+ +  \frac{\lambda}{2}\mathbf w^\top \mathbf w$$

In [21]:
def calculate_primal_objective(y, X, w, lambda_):
    """compute the full cost (the primal objective), that is loss plus regularizer.
    X: the full dataset matrix, shape = (num_examples, num_features)
    y: the corresponding +1 or -1 labels, shape = (num_examples)
    w: shape = (num_features)
    """
    loss = np.maximum(0, (1 - y * X.dot(w))).sum()
    reg = lambda_ / 2 * w.dot(w)
    return loss + reg

In [43]:
xtest = np.array([[0,1,-1]])
wtest = np.array([-2,0,1])
ytest = np.array([1])
assert calculate_primal_objective(ytest, xtest, wtest, 0) == 2

In [3]:
def calculate_accuracy(y, X, w):
    """compute the training accuracy on the training set (can be called for test set as well).
    X: the full dataset matrix, shape = (num_examples, num_features)
    y: the corresponding +1 or -1 labels, shape = (num_examples)
    w: shape = (num_features)
    """
    yhat = X.dot(w) >= 0
    # Map to {-1, 1}
    yhat = (yhat * 2) - 1
    return (yhat == y).mean()

## Stochastic Gradient Descent for SVM

Compute the (stochastic) subgradient for the n-th summand of the SVM optimization objective

---

Given

$$l:\mathbb R \rightarrow \mathbb R, \; z \mapsto \max(0, 1-z) = 
\begin{cases}
1 - z & \text{if } z \le 1\\
0 & \text{otherwise}
\end{cases}$$

We have

$$\frac{\partial l}{\partial z} (z) = \begin{cases}
-1 & \text{if } z < 1\\
0&\text{otherwise}
\end{cases}$$

and is not defined for $z=1$. We thus have to compute the **subgradient**:

$$\partial l(z=1) \in [-1; 0]$$

We could theoretically pick any value. When $z=1$, we've just reached a value we are confident enough in so that we do not penalize it anymore. We might want a non-zero subgradient value for $z=1$ so that we have a bit more margin of confidence. Some value between -1 and 0 (e.g. -0.5) would probably be the best (?), but if we care about implementation efficiency, we should consider either -1 or 0.

Let's call $D_{l(z)}$ the gradient of $l$ where it is defined and $-1$ when it's not. Using the chain rule of derivation:

$$D_{l(\mathbf z = y_n\mathbf x_n^\top\mathbf w)} = \begin{cases}
-y_n\mathbf x_n & \text{if } y_n\mathbf x_n^\top\mathbf w \le 1\\
0 &\text{otherwise}
\end{cases}$$

---

In [23]:
def calculate_stochastic_gradient(y, X, w, lambda_, n, num_examples):
    """compute the stochastic gradient of loss plus regularizer.
    X: the dataset matrix, shape = (num_examples, num_features)
    y: the corresponding +1 or -1 labels, shape = (num_examples)
    w: shape = (num_features)
    n: the index of the (one) datapoint we have sampled
    num_examples: N
    """
    # Be careful about the constant N (size) term!
    # The complete objective for SVM is a sum, not an average as in earlier SGD examples!
    x_n, y_n = X[n], y[n]
    
    # Save element-wise product to perform it only once
    ynxn = y_n * x_n
    non_zero_case = ynxn.dot(w) <= 1
    
    return non_zero_case * (-ynxn)

Implement stochastic gradient descent: Pick a data point uniformly at random and update w based on the gradient for the n-th summand of the objective

In [42]:
y

array([1])

In [47]:
def sgd_for_svm_demo(y, X):
    
    max_iter = 100000
    gamma = 1
    lambda_ = 0.01
    
    num_examples, num_features = X.shape
    w = np.zeros(num_features)
    
    for it in range(max_iter):
        # n = sample one data point uniformly at random data from x
        n = random.randint(0,num_examples-1)
        
        grad = calculate_stochastic_gradient(y, X, w, lambda_, n, num_examples)
        w -= gamma/(it+1) * grad
        
        if it % 10000 == 0:
            cost = calculate_primal_objective(y, X, w, lambda_)
            print("iteration={i}\t\tcost={c}".format(i=it, c=cost))
    
    print("training accuracy = {l}".format(l=calculate_accuracy(y, X, w)))

sgd_for_svm_demo(y, X)

iteration=0		cost=8386119770.757722
iteration=10000		cost=152626542.08219898
iteration=20000		cost=147812820.54694992
iteration=30000		cost=145225574.85023817
iteration=40000		cost=143173523.42858002
iteration=50000		cost=141773753.73270252
iteration=60000		cost=140568868.1734363
iteration=70000		cost=139630640.34684396
iteration=80000		cost=138757998.08359948
iteration=90000		cost=138136797.23861492
training accuracy = 0.6398


## Coordinate Descent (Ascent) for SVM

Compute the closed-form update for the n-th variable alpha, in the dual optimization problem, given alpha and the current corresponding w

In [None]:
def calculate_coordinate_update(y, X, lambda_, alpha, w, n):
    """compute a coordinate update (closed form) for coordinate n.
    X: the dataset matrix, shape = (num_examples, num_features)
    y: the corresponding +1 or -1 labels, shape = (num_examples)
    w: shape = (num_features)
    n: the coordinate to be updated
    """
    # ***************************************************
    # INSERT YOUR CODE HERE
    # TODO
    # ***************************************************
    # calculate the update of coordinate at index=n.
    x_n, y_n = X[n], y[n]
    old_alpha_n = np.copy(alpha[n])
    
    raise NotImplementedError
    return w, alpha

In [None]:
def calculate_dual_objective(y, X, w, alpha, lambda_):
    """calculate the objective for the dual problem."""
    # ***************************************************
    # INSERT YOUR CODE HERE
    # TODO
    # ***************************************************
    raise NotImplementedError

In [None]:
def coordinate_descent_for_svm_demo(y, X):
    max_iter = 100000
    lambda_ = 0.01

    num_examples, num_features = X.shape
    w = np.zeros(num_features)
    alpha = np.zeros(num_examples)
    
    for it in range(max_iter):
        # n = sample one data point uniformly at random data from x
        n = random.randint(0,num_examples-1)
        
        w, alpha = calculate_coordinate_update(y, X, lambda_, alpha, w, n)
            
        if it % 10000 == 0:
            # primal objective
            primal_value = calculate_primal_objective(y, X, w, lambda_)
            # dual objective
            dual_value = calculate_dual_objective(y, X, w, alpha, lambda_)
            # primal dual gap
            duality_gap = primal_value - dual_value
            print('iteration=%i, primal:%.5f, dual:%.5f, gap:%.5f'%(
                    it, primal_value, dual_value, duality_gap))
    print("training accuracy = {l}".format(l=calculate_accuracy(y, X, w)))

coordinate_descent_for_svm_demo(y, X)