# Primal Support Vector Machines

Let $X \in R^{n \times (d+1)}$, $w \in R^{d+1}$ and $y = (y_1,...,y_n)^T \in R^{n}$ and $\texttt{loss}(...) \ge 0$

Objective function: 
$$F(w) = \|w\|^2 + \frac Cn \|\texttt{loss}(y,Xw)\|_1$$

###### For Square Hinge Loss:
$$l_{square-hinge}(y,t) := \max(0, 1 - yt)^2$$
Then
$$\frac{d}{dt}l_{square-hinge}(y,t) = \begin{cases} 0, & \mbox{if } 1-yt \le 0\\ 
2(1 - yt)(-y), & \mbox{if } 1-yt \gt 0  \end{cases}$$
And
$$F(w) = \|w\|^2 + \frac Cn \sum_{i=1}^{n}\max(0, 1 - y_i (Xw)_i)^2$$
And $\texttt{for j = 1,2,...,d+1}$
$$(\vec\nabla F(w))_j = 2w_j + \frac Cn \sum_{i \,:\, 1 - y_i (Xw)_i \gt 0} 2(1 - y_i (Xw)_i)(-y_i) \cdot X_{i,j}$$
so samples with $1 - y_i (Xw)_i \le 0$ contribute nothing, and the gradient reduces to $2w_j$ when no sample violates the margin.<br />
Where $y*(Xw) \in R^{n}$ is the elementwise product, i.e. $(y*(Xw))_i = y_i (Xw)_i$<br />

## SVM Implementation with squared hinge loss using Newton's method

In [None]:
from sklearn.datasets import make_regression
from sklearn.cross_validation import train_test_split
from sklearn.kernel_ridge import KernelRidge
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
import sys
import numpy as np
from numpy.linalg import norm
from numpy.random import randint
import random
import math
from math import exp
import time 


def kernel(xi,xj): # const
    """Return the kernel value for two sample vectors.

    The kernel in use is the linear one, i.e. the inner product of
    ``xi`` and ``xj`` as computed by ``np.dot``.
    """
    # A Gaussian alternative tried during experimentation (kept for reference):
    #     sigma = 8
    #     exp(-(norm(xi-xj)**2)/(2*(sigma**2)))
    inner_product = np.dot(xi, xj)
    return inner_product


def loss(y,t): # const
    """Squared hinge loss ``max(0, 1 - y*t)**2`` for label ``y`` and score ``t``."""
    slack = 1 - (y*t)
    clipped = max(0, slack)
    return clipped ** 2


def loss_der(y,t): # const
    """Derivative of the squared hinge loss with respect to the score ``t``.

    Returns 0 when ``1 - y*t <= 0`` and ``2*(1 - y*t)*(-y)`` otherwise.
    """
    slack = 1 - (y*t)
    return 0 if slack <= 0 else 2*slack*(-y)


def H(C,X,sv): # const
    """Return ``I + C * sum_{i in sv} x_i x_i^T`` as a (d, d) matrix.

    Parameters
    ----------
    C  : scalar weight applied to the sum of outer products.
    X  : 2-D array-like of samples, one row per sample, d columns.
    sv : iterable of row indices into X that enter the sum (may be empty).

    Returns
    -------
    numpy.ndarray of shape (d, d).

    The previous implementation used ``np.dot(X[i], X[i].T)``; for a 1-D
    row that is the scalar inner product, which was then broadcast into
    every entry of the matrix instead of adding the outer product.

    NOTE(review): for the objective ``norm(w)**2 + C*sum(loss)`` used by
    compute_obj the exact Hessian on the rows in ``sv`` would be twice
    this matrix; the scale is left unchanged so callers keep their
    effective step length.
    """
    X = np.asarray(X)
    d = X.shape[1]
    # Work in float64 like the original accumulator, whatever X's dtype is.
    rows = X[np.asarray(sv, dtype=int)].astype(float)
    # rows.T . rows equals the sum of the outer products of the selected rows
    # (and is a zero matrix when sv is empty).
    outer_sum = np.dot(rows.T, rows)
    return np.eye(d) + C*outer_sum


def H_inv(C,X,sv): # const
    """Return the explicit matrix inverse of ``H(C, X, sv)``."""
    curvature = H(C, X, sv)
    return np.linalg.inv(curvature)


def compute_obj(w,C,X,y): # const
    """Evaluate ``norm(w)**2 + C * sum_i loss(y[i], kernel(X[i], w))``.

    Asserts that X and y have the same length and that the first row of
    X has as many entries as w.
    """
    n_samples = len(X)
    assert n_samples==len(y)
    assert len(X[0])==len(w)
    total_loss = 0.0
    # Accumulate the per-sample losses in row order.
    for row, label in zip(X, y):
        total_loss += loss(label, kernel(row, w))
    return norm(w)**2 + C*total_loss


def compute_grad(w,C,X,sv,y): # const
    """Return ``2*w + sum_{i in sv} 2*C*(w . X[i] - y[i]) * X[i]``.

    Only the rows of X listed in ``sv`` contribute to the sum.  The
    caller's ``w`` is not modified; a new array is returned.

    NOTE(review): the term ``(w . x - y) * x`` matches the squared-hinge
    derivative ``(1 - y*(w . x))*(-y)*x`` only when ``y*y == 1`` and the
    sample has ``1 - y*(w . x) > 0`` - confirm that callers pass labels in
    {-1, +1} and a suitable ``sv``.
    """
    # Bounds probe: raises IndexError if X cannot be indexed at
    # [last row, len(w)-1]; the value itself is discarded.
    X[len(X)-1,len(w)-1]
    grad = w.copy()
    grad *= 2
    for idx in sv:
        assert len(X) > idx
        sample = X[idx]
        residual = np.dot(w, sample) - y[idx]
        grad += 2*C*residual*sample
    return grad


def numer_grad(w,ep,delta,C,X,y): # const
    """Central-difference estimate of the objective's slope along ``delta``.

    Evaluates compute_obj at ``w + ep*delta`` and ``w - ep*delta`` and
    divides the difference by ``2*ep``.
    """
    offset = ep*delta
    forward = compute_obj(w+offset,C,X,y)
    backward = compute_obj(w-offset,C,X,y)
    return (forward - backward)/(2*ep)


def grad_checker(w0,C,X,y): # const
    """Numerically estimate the gradient of the objective at ``w0``.

    For every coordinate a unit vector is built and passed to numer_grad
    with a step of 1e-4; the per-coordinate estimates are returned as an
    array.
    """
    ep = .0001
    estimates = []
    for axis in range(len(w0)):
        # Unit direction along the current coordinate.
        direction = np.zeros(w0.shape)
        direction[axis] = 1
        estimates.append(numer_grad(w0,ep,direction,C,X,y))
    return np.asarray(estimates)

def my_gradient_descent(X,y,sv,w0=None,initial_step_size=.1,max_iter=1000,C=1,
                        X_test=None, y_test=None,descent_type=''): # const
    """Minimise the objective with damped Newton-style steps.

    Every iteration evaluates compute_obj and compute_grad at the current
    ``w`` and then moves by ``-0.9 * H_inv(C, X, sv) . grad``.  The loop
    ends when the gradient norm drops below 1e-4, when the objective
    after a step is more than ten times the objective before it, or after
    ``max_iter`` iterations.  Progress is printed on every iteration.

    Parameters
    ----------
    X, y      : training samples (one row per sample) and their labels.
    sv        : indices of the rows of X used by compute_grad and H_inv.
    w0        : starting point; zeros of length ``len(X[0])`` when None.
                The array passed in is copied, not modified.
    initial_step_size, descent_type :
                accepted for compatibility; not used by this routine.
    max_iter  : maximum number of iterations.
    C         : loss weight forwarded to the objective/gradient helpers.
    X_test, y_test :
                optional held-out set; when either is None no test scores
                are recorded and ``testing_error_array`` stays empty.

    Returns
    -------
    ``(w, w_array, obj_array, training_error_array, testing_error_array)``
    where the lists hold one entry per recorded iterate, or the string
    ``'Error'`` when X is empty.

    NOTE(review): ``sv`` is never refreshed inside the loop, so samples
    that stop violating the margin keep contributing to the gradient;
    H() also returns half of the exact Hessian of compute_obj, making the
    effective step 1.8x a Newton step.  Confirm whether this is intended.
    """
    tol = 10**-4 # scikit learn default
    # Check for an empty data set before X[0] is touched.
    if len(X) == 0:
        return 'Error'
    # `is None`: comparing an ndarray with == is elementwise and cannot be
    # used as a single truth value.
    if w0 is None:
        w0 = np.zeros(len(X[0]))
    # Private float copy so the in-place updates never alter the caller's w0.
    w = np.array(w0, dtype=float)
    have_test = X_test is not None and y_test is not None

    obj_array = []
    training_error_array = [score(X, y, w=w)]
    testing_error_array = []
    if have_test:
        testing_error_array.append(score(X_test, y_test, w=w))
    w_array = [w.copy()]

    gamma = 0.9
    # C, X and sv do not change inside the loop, so the inverse is computed
    # at most once - lazily, so it is skipped when no step is ever taken.
    h_inv = None
    converged = False
    # The print calls pass one pre-formatted string so that the output is
    # the same under the Python 2 print statement and the Python 3 function.
    for i in range(max_iter):
        print('i %s' % (i,))
        obj = compute_obj(w,C,X,y)
        print('obj %s' % (obj,))
        obj_array.append(obj)

        grad = compute_grad(w,C,X,sv,y)
        grad_norm = norm(grad)
        print('grad %s' % (grad_norm,))
        if grad_norm < tol:
            converged = True
            break

        if h_inv is None:
            h_inv = H_inv(C,X,sv)
        step = - gamma * np.dot(h_inv, grad)
        print('step %s' % (norm(step),))
        w += step
        print('w %s' % (norm(w),))

        w_array.append(w.copy())
        training_error_array.append(score(X, y, w=w))
        if have_test:
            testing_error_array.append(score(X_test, y_test, w=w))

        # Divergence guard: give up if the step blew the objective up tenfold.
        if obj*10 < compute_obj(w,C,X,y):
            break

    if not converged:
        print('Warning: Did not converge.')
    return w, w_array, obj_array, training_error_array, testing_error_array
    
def score(X, y, w): # const
    """Classify every row of X with ``sign(kernel(w, x))`` and tally the outcome.

    A sample counts as correct when the prediction is +1 and the label is
    1, or when the prediction is -1 or 0 and the label is -1; every other
    combination counts as failed.

    Returns the 4-tuple ``('correct', correct_fraction, 'failed',
    failed_fraction)`` with both fractions taken over ``len(X)``.
    """
    n_correct = 0.0
    n_failed = 0.0
    total = len(X)
    for i in range(total):
        predicted = np.sign(kernel(w,X[i]))
        positive_hit = predicted == 1 and y[i] == 1
        negative_hit = (predicted == -1 or predicted == 0) and y[i] == -1
        if positive_hit or negative_hit:
            n_correct += 1
        else:
            n_failed += 1
    return 'correct',n_correct/len(X), 'failed',n_failed/len(X)


def my_svm(X_train, y_train,sv,max_iter=None,C=None,X_test=None, y_test=None): # const
    """Train the primal SVM from a zero starting point.

    Thin wrapper around my_gradient_descent that starts at
    ``w0 = zeros(len(X_train[0]))``.

    Parameters
    ----------
    X_train, y_train : training samples and labels.
    sv               : row indices forwarded to my_gradient_descent.
    max_iter, C      : forwarded only when not None, so that leaving them
                       unset falls back to my_gradient_descent's own
                       defaults instead of passing None through (which
                       made ``range(max_iter)`` fail).
    X_test, y_test   : optional held-out set, forwarded unchanged.

    Returns
    -------
    Whatever my_gradient_descent returns:
    ``(w, w_array, obj_array, training_error_array, testing_error_array)``.
    """
    w0 = np.zeros(len(X_train[0]))
    overrides = {}
    if max_iter is not None:
        overrides['max_iter'] = max_iter
    if C is not None:
        overrides['C'] = C
    w, w_array, obj_array, training_error_array, testing_error_array = \
                    my_gradient_descent(X_train, y_train, sv, w0=w0,
                                        X_test=X_test, y_test=y_test, **overrides)
    return w, w_array, obj_array, training_error_array, testing_error_array
    

## SVM Usage Example

In [None]:
# Load the training file: each line is whitespace separated, the first
# token is the label and the remaining tokens are the feature values.
# (presumably the USPS "zip" digit data - confirm against the data source)
X = []
y = []
with open('zip.train', 'r') as f:
    for line in f:
        line_split = line.split()
        y.append(float(line_split[0]))
        X.append(line_split[1:])
X = np.asarray(X)
# The features were read as strings; convert them to float32.
X = X.astype(np.float32, copy=False)
y = np.asarray(y)

# append constant dimension
# X = np.column_stack((X, np.ones(X.shape[0])))

# Turn the labels into a binary problem: below 4.5 -> -1, otherwise -> +1.
# The -1 assigned first is itself below 4.5, so the second line only
# touches the labels that were originally >= 4.5.
y[y < 4.5] = -1
y[y >= 4.5] = 1

# Sanity checks: features and labels line up, overall and per class.
assert len(X)==len(y);assert len(X[y==-1])==len(y[y==-1]);assert len(X[y==1])==len(y[y==1]);


# Load and binarise the test file in exactly the same way.
X_test = []
y_test = []
with open('zip.test', 'r') as f:
    for line in f:
        line_split = line.split()
        y_test.append(float(line_split[0]))
        X_test.append(line_split[1:])
X_test = np.asarray(X_test)
X_test = X_test.astype(np.float32, copy=False)
y_test = np.asarray(y_test)

y_test[y_test < 4.5] = -1
y_test[y_test >= 4.5] = 1

# Keep only 5% of the training file for fitting (fixed seed); the
# remaining split is discarded through the throw-away name `ignore`.
X_train, ignore, y_train, ignore = train_test_split(X, y, train_size=0.05, random_state=20140210)
print X_train.shape
assert len(X_train)>1;assert len(X_test)>1;assert len(X_train)==len(y_train);assert len(X_test)==len(y_test)

max_iter=100
C=1.0

# Every training row is passed as a "support vector" index.
sv = range(len(X_train))
w, w_array, obj_array, training_error_array, testing_error_array = my_svm(X_train, y_train, sv, max_iter=max_iter,C=C,
                                                                          X_test=X_test, y_test=y_test)
print 'Custom w =',norm(w),' test score = ',score(X_test, y_test, w=w)

# Reference models from scikit-learn, fitted on the same subsample and
# evaluated on the same test set.
from sklearn import svm
clf = svm.SVC(C=C)
clf.fit(X_train, y_train)
print 'SVC',' test score = ',clf.score(X_test, y_test)

# Squared-hinge SGD without intercept; the regularisation strength is 1/C.
clf = SGDClassifier(loss='squared_hinge', penalty="l2",alpha=1/C, fit_intercept=False)
clf.fit(X_train, y_train); assert clf.intercept_ == 0
print 'SGDClassifier w = ',norm(clf.coef_[0]),' test score = ',clf.score(X_test, y_test)
#score(X_test, y_test,w=clf.coef_[0])

# Squared-hinge linear SVM without intercept; `clf` stays bound to this
# model after the cell finishes.
clf = LinearSVC(loss='squared_hinge', penalty="l2",C=C, fit_intercept=False); clf.fit(X_train, y_train)
assert clf.intercept_ == 0
print 'LinearSVC w = ',norm(clf.coef_[0]),' test score = ',clf.score(X_test, y_test)
#score(X_test, y_test, w=clf.coef_[0])


In [None]:
%matplotlib nbagg
# Scatter the first two coordinates of every recorded iterate of w.
# Start from a clean figure/axes.
plt.clf()
plt.cla()

ax = plt.subplot(1,1,1)

# w_array holds one weight vector per recorded iterate; column 0 goes on
# the x axis and column 1 on the y axis.
w_array = np.asarray(w_array)
ax.scatter(w_array[:,0],w_array[:,1],marker='^')

# NOTE(review): w_stoch_array is not defined anywhere in the visible
# notebook - presumably left over from a stochastic-descent run in
# another cell/session; confirm it exists before running this cell.
w_stoch_array = np.asarray(w_stoch_array)
ax.scatter(w_stoch_array[:,0],w_stoch_array[:,1],marker='*')

# handles/labels are fetched but not used in this cell (no legend is drawn).
handles, labels = ax.get_legend_handles_labels()
plt.title('First Two Dimensions of Hyperplane over iterations')
plt.ylabel('w [1]')
plt.xlabel('w [0]')

In [None]:
%matplotlib nbagg
# Scatter the objective value against the (1-based) iteration number.
# Start from a clean figure/axes.
plt.clf()
plt.cla()

ax = plt.subplot(1,1,1)

# One objective value per iteration, plotted at x = 1..len(obj_array).
obj_array = np.asarray(obj_array)
ax.scatter(range(1,len(obj_array)+1),obj_array,marker='^')

# NOTE(review): obj_stoch_array is not defined anywhere in the visible
# notebook - presumably produced by a stochastic-descent run in another
# cell/session; confirm it exists before running this cell.
obj_stoch_array = np.asarray(obj_stoch_array)
ax.scatter(range(1,len(obj_stoch_array)+1),obj_stoch_array,marker='*')

# handles/labels are fetched but not used in this cell (no legend is drawn).
handles, labels = ax.get_legend_handles_labels()
plt.title('Objective over iterations')
plt.ylabel('F (w)')
plt.xlabel('Iteration')

In [None]:
%matplotlib nbagg
# Scatter the training/testing classification error against the
# (1-based) index of the recorded iterate.
plt.clf()
plt.cla()

ax = plt.subplot(1,1,1)

# Each entry recorded by score() is the tuple
# ('correct', correct_fraction, 'failed', failed_fraction).  Mixing text
# and numbers makes np.asarray build a string array, so the error has to
# be taken from column 3 (the failed fraction; column 1 is the *correct*
# fraction) and converted back to float before plotting.
training_error_array = np.asarray(training_error_array)
testing_error_array = np.asarray(testing_error_array)
training_failed = training_error_array[:,3].astype(float)
testing_failed = testing_error_array[:,3].astype(float)
ax.scatter(range(1,len(training_failed)+1),training_failed,marker='^',label='training error')
ax.scatter(range(1,len(testing_failed)+1),testing_failed,marker='*',label='testing error')

# NOTE(review): the *_stoch_array names are not defined anywhere in the
# visible notebook - presumably produced by a stochastic-descent run in
# another cell/session and assumed here to hold score() tuples as well;
# confirm before running this cell.
training_error_stoch_array = np.asarray(training_error_stoch_array)
testing_error_stoch_array = np.asarray(testing_error_stoch_array)
training_failed_stoch = training_error_stoch_array[:,3].astype(float)
testing_failed_stoch = testing_error_stoch_array[:,3].astype(float)
# '@' and '#' are not valid matplotlib marker codes; use circle/square.
# The labels are made distinct so the legend can tell the four series apart.
ax.scatter(range(1,len(training_failed_stoch)+1),training_failed_stoch,marker='o',
           label='training error (stochastic)')
ax.scatter(range(1,len(testing_failed_stoch)+1),testing_failed_stoch,marker='s',
           label='testing error (stochastic)')

handles, labels = ax.get_legend_handles_labels()
plt.legend(handles, labels)
plt.title('Classification Error over iterations')
plt.ylabel('Classification Error')
plt.xlabel('Iteration')

In [None]:
%matplotlib nbagg
# Plot the first two feature dimensions of the data together with the
# first two weight components of our model and of the last fitted
# scikit-learn model.
plt.clf()
plt.cla()

ax = plt.subplot(1,1,1)

# print w
# Line from (w[0], w[1]) to the origin for the custom solution.
x_plot=[w[0], 0]
y_plot=[w[1], 0]
ax.plot(x_plot,y_plot)

# print clf.coef_[0]
# Same for the scikit-learn model currently bound to `clf`.
x_plot=[clf.coef_[0][0], 0]
y_plot=[clf.coef_[0][1], 0]
ax.plot(x_plot,y_plot)

# The labels were remapped to -1/+1 when the data was loaded, so the
# negative class is y == -1; the former `y == 0` mask selected no points.
ax.scatter((X[y==-1])[:,0],(X[y==-1])[:,1],marker='*')
ax.scatter((X[y==1])[:,0],(X[y==1])[:,1],marker='^')

# handles/labels are fetched but not used in this cell (no legend is drawn).
handles, labels = ax.get_legend_handles_labels()
plt.title('Data, Scikit-learn Hyperplane, and Our Own Hyperplane')
plt.ylabel('y')
plt.xlabel('x')