In [1]:
import numpy as np

from sklearn import datasets
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.svm import LinearSVC, SVC, LinearSVR, SVR

# Linear SVM Classification

## Soft Margin Classification

In [2]:
iris = datasets.load_iris()

x = iris['data'][:, (2, 3)] # Select the petal length and the petal width
y = (iris['target'] == 2).astype(np.float64) # Select the class Iris Virginica

In [3]:
svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_svc', LinearSVC(C=1, loss='hinge'))
])

In [4]:
svm_clf.fit(x, y)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('linear_svc',
                 LinearSVC(C=1, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='hinge', max_iter=1000, multi_class='ovr',
                           penalty='l2', random_state=None, tol=0.0001,
                           verbose=0))],
         verbose=False)

In [5]:
svm_clf.predict([[5.5, 1.7]])

array([1.])

# Nonlinear SVM Classificaton

In [6]:
x, y = make_moons(n_samples=100, noise=0.15)

polynomial_svm_clf = Pipeline([
    ('poly_features', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('svm_clf', LinearSVC(C=10, loss='hinge'))
])

In [7]:
polynomial_svm_clf.fit(x, y)



Pipeline(memory=None,
         steps=[('poly_features',
                 PolynomialFeatures(degree=3, include_bias=True,
                                    interaction_only=False, order='C')),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm_clf',
                 LinearSVC(C=10, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='hinge', max_iter=1000, multi_class='ovr',
                           penalty='l2', random_state=None, tol=0.0001,
                           verbose=0))],
         verbose=False)

In [8]:
y[45]

1

In [9]:
y_hat = polynomial_svm_clf.predict(x)

In [10]:
a = 0

for i in range(y.shape[0]):
  if y[i] != y_hat[i]:
    print(f'The value was different for i = {i}')
    a += 1

print(f'The total of mistakes was {a}')

The value was different for i = 44
The total of mistakes was 1


In [11]:
poly_kernel_svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svm_clf', SVC(kernel='poly', degree=3, coef0=1, C=5))
])

poly_kernel_svm_clf.fit(x, y)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm_clf',
                 SVC(C=5, break_ties=False, cache_size=200, class_weight=None,
                     coef0=1, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='poly', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

## Gaussian RBF Kernel

In [12]:
rbf_kernel_svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svm_clf', SVC(kernel='rbf', gamma=5, C=0.001))
])

rbf_kernel_svm_clf.fit(x, y)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm_clf',
                 SVC(C=0.001, break_ties=False, cache_size=200,
                     class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=5,
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

# SVM Regression

In [13]:
svm_reg = LinearSVR(epsilon=1.5)
svm_reg.fit(x, y)

LinearSVR(C=1.0, dual=True, epsilon=1.5, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [15]:
svm_poly_reg = SVR(kernel='poly', degree=2, C=100, epsilon=0.1)
svm_poly_reg.fit(x, y)

SVR(C=100, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [17]:
svm_clf.decision_function

<function sklearn.pipeline.Pipeline.decision_function>

# Under the Hood

## Decision Function and Predictions

A linear SVM Classifier makes the predictions by computing the decision function $\textbf{w}^{\top}\textbf{x}+b = w_{1}b_{1}+...+w_{n}b_{n}+b$. If the result of this weighted sum, the predicted class $\hat{y}$ is possitive (1), otherwise the result is the negative class (0)

- Linear SVM Classifier Prediction
> $\hat{y}=\begin{cases}{0 \text{ if } \textbf{w}^{\top}\textbf{x}+b < 0}\\{1 \text{ if } \textbf{w}^{\top}\textbf{x}+b \ge 0}\end{cases}$

- Hard margin linear SVM Classifier objective
> $\underbrace{\text{minimize}}_{w,b}  \frac{1}{2}\textbf{w}^{\top}\textbf{w}$

- Soft margin linear SVM classifier objective
> $\underbrace{\text{minimize}}_{w,b,\zeta}  \frac{1}{2}\textbf{w}^{\top}\textbf{w}+C\sum\limits_{i=1}^{m}{\zeta^{(i)}}$


# Exercises

1. What is the fundamental idea behind Support Vector Machines?

> The fundamental idea behind the Support Vector Machine is to fit a street as wide as possible between two classes that are linealy separable. The objective is to fit the widest street possible.

2. What is a support vector?

> A support vector is the instance in the margin of the street. This vector works as the limit between two classes for Hard Margin Classification.

3. Why is it important to scale the inputs when using SVMs?

> Scaling the inputs for SVMs allows the algorithm to make the widest possible street, i.e., making the decision boundary wider.

4. Can an SVM classifier output a confidence score when it classifies an instance? What about a probability?

> 

5. Should you use the primal or the dual form of the SVM problem to train a model on a training set with millions of instances and hundreds of features?


6. Say you’ve trained an SVM classifier with an RBF kernel, but it seems to underfit the training set. Should you increase or decrease γ (gamma)? What about C?


7. How should you set the QP parameters (H, f, A, and b) to solve the soft margin linear SVM classifier problem using an off-the-shelf QP solver?

8. Train a LinearSVC on a linearly separable dataset. Then train an SVC and a
SGDClassifier on the same dataset. See if you can get them to produce roughly
the same model.

9. Train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers, you will need to use one-versus-the-rest to classify all 10 digits. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?


10. Train an SVM regressor on the California housing dataset.