[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncerdan/HandsOnML/blob/master/Ch_05_SVM.ipynb)

# Linear SVM Classification

## Soft Margin Classification

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# load the data
iris = datasets.load_iris()
X = iris['data'][:, (2, 3)] # petal length, petal width
y = (iris['target'] == 2).astype(np.float64)    # is iris virginica

In [2]:
# create a linear SVM Classifier that standardizes inputs
# C: controls penalty for margin violations (1 is smaller)
# loss: controls loss function
# LinearSVC regularizes the bias term so should first center about mean
#   StandardScaler() handles this

svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_svc', LinearSVC(C=1, loss='hinge'))
])

svm_clf.fit(X, y)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('linear_svc',
                 LinearSVC(C=1, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='hinge', max_iter=1000, multi_class='ovr',
                           penalty='l2', random_state=None, tol=0.0001,
                           verbose=0))],
         verbose=False)

In [3]:
# can use it to predict
svm_clf.predict([[5.5, 1.7]])

array([1.])

In [4]:
# could also instead use the plain SVC class with a linear kernel
#   NOTE: LinearSVC is much faster than SVC(kernel='linear')
from sklearn.svm import SVC

other_svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('other_linear_svc', SVC(kernel='linear', C=1))                     
])

other_svm_clf.fit(X, y)
other_svm_clf.predict([[5.5, 1.7]])

array([1.])

In [None]:
""" could also use SGD as follows:
from sklearn.linear_model import SGDClassifier

sgd_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_sgd', SGDClassifier(loss='hinge', alpha=1/(m*C)))
])

sgd_clf.fit(X, y)
sgd_clf.predict([[5.5, 1.7]])
"""

# Nonlinear SVM CLassification

In [6]:
# can use PolynomialFeatures with LinearSVC
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

X, y = make_moons(n_samples=100, noise=0.15)

polynomial_svm_clf = Pipeline([
    ('poly_features', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('svm_clf', LinearSVC(C=10, loss='hinge'))
])

polynomial_svm_clf.fit(X, y)



Pipeline(memory=None,
         steps=[('poly_features',
                 PolynomialFeatures(degree=3, include_bias=True,
                                    interaction_only=False, order='C')),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm_clf',
                 LinearSVC(C=10, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='hinge', max_iter=1000, multi_class='ovr',
                           penalty='l2', random_state=None, tol=0.0001,
                           verbose=0))],
         verbose=False)

## Polynomial Kernel

In [8]:
# this trick works, but when using larger polynomial degrees it becomes expensive.
# to prevent this it uses the Kernel Trick which allows you to do this without
# actually expanding all of the features. This is implemented with the SVC class
# coef0: control how much it is influcenced by high vs low degree values
from sklearn.svm import SVC
poly_kernel_svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svm_clf', SVC(kernel='poly', degree=3, coef0=1, C=5))
])

poly_kernel_svm_clf.fit(X, y)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm_clf',
                 SVC(C=5, break_ties=False, cache_size=200, class_weight=None,
                     coef0=1, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='poly', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

## Gaussian RBF Kernel

In [9]:
# runs Gaussian RBF similarity function using kernel trick
# gamma: controls width of bell curve (large gamma = narrower curve)
#   so can use this to regularize. large --> more fitting
#                                  small --> less fitting

rbf_kernerl_svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svm_clf', SVC(kernel='rbf', gamma=5, C=0.001))
])

rbf_kernerl_svm_clf.fit(X, y)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svm_clf',
                 SVC(C=0.001, break_ties=False, cache_size=200,
                     class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=5,
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

# SVM Regression

In [10]:
# for linear data
from sklearn.svm import LinearSVR

svm_reg = LinearSVR(epsilon=1.5)
svm_reg.fit(X, y)

LinearSVR(C=1.0, dual=True, epsilon=1.5, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [11]:
# kernelized for nonlinear data
from sklearn.svm import SVR

svm_poly_reg = SVR(kernel='poly', degree=2, C=100, epsilon=0.1)
svm_poly_reg.fit(X, y)

SVR(C=100, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

# Exercises

## 9.) Train an SVM Classifier on the MNIST dataset using one-vs-rest to make it multi-class

In [13]:
# first let's get the data
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, cache=True)

X = mnist['data']
y = mnist['target']

X_train = X[:60000]
y_train = y[:60000]
X_test = X[60000:]
y_test = y[60000:]

In [18]:
# scaling is important in SVM
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled = scaler.transform(X_test.astype(np.float32))

In [19]:
# let's try a RBF kernel first
from sklearn.svm import SVC

rbf_clf = SVC(kernel='rbf')

# only train on small subset for speed for now
rbf_clf.fit(X_train_scaled[:10000], y_train[:10000])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
# let's checkout its performance
from sklearn.metrics import accuracy_score
y_pred = rbf_clf.predict(X_train_scaled)
accuracy_score(y_train, y_pred)     # --> 94.5% is not too bad!

0.9455333333333333

In [23]:
# now lets do a random search on hyperparams to try and optimize
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, reciprocal

param_distrs = {'gamma': reciprocal(0.001, 0.1), 'C': uniform(1, 10)}
rand_search_cv = RandomizedSearchCV(rbf_clf, param_distrs, n_iter=10, verbose=2, cv=3)

# use very small sets to go faster
rand_search_cv.fit(X_train_scaled[:1000], y_train[:1000])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=7.679152436041923, gamma=0.01958133161609739 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ... C=7.679152436041923, gamma=0.01958133161609739, total=   1.3s
[CV] C=7.679152436041923, gamma=0.01958133161609739 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV] ... C=7.679152436041923, gamma=0.01958133161609739, total=   1.3s
[CV] C=7.679152436041923, gamma=0.01958133161609739 ..................
[CV] ... C=7.679152436041923, gamma=0.01958133161609739, total=   1.3s
[CV] C=4.100926758624821, gamma=0.03654242289779093 ..................
[CV] ... C=4.100926758624821, gamma=0.03654242289779093, total=   1.3s
[CV] C=4.100926758624821, gamma=0.03654242289779093 ..................
[CV] ... C=4.100926758624821, gamma=0.03654242289779093, total=   1.3s
[CV] C=4.100926758624821, gamma=0.03654242289779093 ..................
[CV] ... C=4.100926758624821, gamma=0.03654242289779093, total=   1.3s
[CV] C=8.509401151212021, gamma=0.07824535370623982 ..................
[CV] ... C=8.509401151212021, gamma=0.07824535370623982, total=   1.3s
[CV] C=8.509401151212021, gamma=0.07824535370623982 ..................
[CV] ... C=8.509401151212021, gamma=0.07824535370623982, total=   1.3s
[CV] C=8.509401151212021, gamma=0.07824535370623982 ..................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   35.7s finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb08f6191d0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fb08f619668>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=2)

In [24]:
# let's see what we found
rand_search_cv.best_estimator_

SVC(C=4.935354011782927, break_ties=False, cache_size=200, class_weight=None,
    coef0=0.0, decision_function_shape='ovr', degree=3,
    gamma=0.0011264367733838313, kernel='rbf', max_iter=-1, probability=False,
    random_state=None, shrinking=True, tol=0.001, verbose=False)

In [27]:
# what was it's score on just the 1000 data points?
rand_search_cv.best_score_  # --> not too bad for such a limited dataset

0.8639987292681903

In [28]:
# now let's train it on the whole data set and see how it does
rand_search_cv.best_estimator_.fit(X_train_scaled, y_train)

SVC(C=4.935354011782927, break_ties=False, cache_size=200, class_weight=None,
    coef0=0.0, decision_function_shape='ovr', degree=3,
    gamma=0.0011264367733838313, kernel='rbf', max_iter=-1, probability=False,
    random_state=None, shrinking=True, tol=0.001, verbose=False)

In [30]:
# and let's check its accuracy
y_pred = rand_search_cv.best_estimator_.predict(X_train_scaled)
accuracy_score(y_train, y_pred)

0.9963166666666666

In [31]:
# seem's pretty good. now let's check it's testing accuracy
y_test_pred = rand_search_cv.best_estimator_.predict(X_test_scaled)
accuracy_score(y_test, y_test_pred)

0.9728

In [33]:
# 97% accuracy! That's pretty good! 
# It's lower than the 99% shown from the training score, so that means my
# model is probably slightly overfit -- but not too bad overall!