# Линейный SVM "своими руками"

## Генерируем обучающую и тестовую выборку для экспериментов

In [200]:
from sklearn.cross_validation import train_test_split
from sklearn import datasets

X, y = datasets.make_classification(n_samples=10000, n_features=20, n_classes=2, n_informative=20, n_redundant=0,
                                    random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

print len(X), len(y)
print len(X_train)

10000 10000
8000


## Пишем свой класс для SVM

In [201]:
import numpy as np
import random
from random import randint

np.random.seed(42)
random.seed(42)


class MySVM(object):
    def __init__(self, C=1000):
        self.C = C # regularization constant

    # f(x) = <w,x> + w_0
    def f(self, x):
        return np.dot(self.w, x) + self.w0

    # a(x) = [f(x) > 0]
    def a(self, x):
        return 1 if self.f(x) > 0 else 0
    
    # predicting answers for X_test
    def predict(self, X_test):
        return np.array([self.a(x) for x in X_test])

    # l2-regularizator
    def reg(self):
        return 1.0 * sum(self.w ** 2) / (2.0 * self.C)

    # l2-regularizator derivative
    def der_reg(self):
        return self.w * (1.0 / self.C)

    # hinge loss
    def loss(self, x, answer):
        return max([0, 1 - answer * self.f(x)])

    # hinge loss derivative
    def der_loss(self, x, answer):
        return -1.0 if 1 - answer * self.f(x) > 0 else 0.0

    # fitting w and w_0 with SGD
    def fit(self, X_train, y_train):
        dim = len(X_train[0])
        self.w = np.random.rand(dim) # initial value for w
        self.w0 = np.random.randn() # initial value for w_0
        
        for k in range(1000):
            # simple heuristic for step size
            step = 0.3 * 0.99 ** k # 0.99 for slowly decreasing step size
            
            for i in range(20): #batch with the same step size
                # random example choise
                rand_index = randint(0, len(X_train) - 1) # generating random index
                x = X_train[rand_index]
                y = y_train[rand_index]
                # y = 0 or 1, let's map it to {-1, 1}
                y = 1 if y == 1 else -1

                
                # w update
                self.w -= (x * step * y * self.der_loss(x, y) + step * self.der_reg())

                # w_0 update
                self.w0 -= step * y * self.der_loss(x, y)

## Пробуем обучить наш классификатор и посмотреть на качество на тесте

In [202]:
model = MySVM()
model.fit(X_train, y_train)
print model.w, model.w0

[-0.09132423 -0.14740224 -0.00515776  0.12376355  0.03276435  0.28173385
  0.0022888   0.08676591 -0.14824715 -0.10875793  0.14366228  0.15504341
  0.21585587  0.09100913 -0.22686879  0.06355078  0.13570312  0.19187777
 -0.01498881  0.20334607] -0.00162054781595


In [203]:
predictions = model.predict(X_test)

In [204]:
print predictions

[1 0 0 ..., 1 1 0]


In [205]:
print y_test, len(y_test), sum(y_test)

[1 0 1 ..., 1 0 1] 2000 991


In [206]:
print len(predictions), sum(predictions)

2000 1035


Вычислим accuracy на нашем датасете:

In [207]:
print sum(predictions == y_test) / float(len(y_test))

0.805
