# A Practical Guide to Support Vector Classification

## Examples of the Proposed Procedure

### Package Installation

In [1]:
%%capture
# Install the LIBSVM Python bindings (-U upgrades an existing copy) and the
# numerical packages this notebook imports. %pip targets the running kernel's
# environment; the %%capture at the top of the cell hides pip's output.
%pip install -U libsvm-official
%pip install numpy
%pip install scipy

In [2]:
from libsvm.svmutil import *
import numpy as np

import io
import urllib.request

### Astroparticle Physics

- Load data

You can find data sets at [LIBSVM Data Sets](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/).

In [3]:
def load_url_libsvm(url, return_scipy=True):
    """Download a LIBSVM-format data set and parse it with ``svm_read_problem``.

    Args:
        url: Address of the data file; passed to ``urllib.request.urlopen``.
        return_scipy: Forwarded unchanged to ``svm_read_problem``.

    Returns:
        Whatever ``svm_read_problem`` returns for the downloaded text (the
        callers in this notebook unpack it as ``y, x``).
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the locale's
    # default text encoding; leaving the with-block closes the text wrapper
    # and, through it, the underlying response.
    with io.TextIOWrapper(
        urllib.request.urlopen(url), encoding="utf-8"
    ) as stream:
        return svm_read_problem(stream, return_scipy=return_scipy)

# svmguide1 data: the training file and its ".t" test companion.
svmguide1_train_url = (
    'https://www.csie.ntu.edu.tw/~cjlin/'
    'libsvmtools/datasets/binary/svmguide1'
)
svmguide1_test_url = (
    'https://www.csie.ntu.edu.tw/~cjlin/'
    'libsvmtools/datasets/binary/svmguide1.t'
)

# Download and parse both files into label / feature pairs.
y_train, x_train = load_url_libsvm(svmguide1_train_url)
y_test, x_test = load_url_libsvm(svmguide1_test_url)

- Original sets with default parameters

In [4]:
# Baseline: train on the unscaled data with no options other than "-q"
# (LIBSVM's quiet flag), then predict the unscaled test set.
m = svm_train(y_train, x_train, "-q")
pred_labels, pred_metrics, pred_values = svm_predict(y_test, x_test, m)

Accuracy = 66.925% (2677/4000) (classification)


- Scaled sets with default parameters

In [5]:
%%capture
# Derive the scaling parameters from the training set only, with target
# bounds lower=-1 and upper=1.
scale_param = csr_find_scale_param(x_train, lower=-1, upper=1)
x_train_scaled = csr_scale(x_train, scale_param)
# The test set is scaled with the parameters found on the training set.
x_test_scaled = csr_scale(x_test, scale_param)

In [6]:
# Same options as the baseline, but trained and evaluated on the scaled
# features.
m_scaled = svm_train(y_train, x_train_scaled, "-q")
pred_labels, pred_metrics, pred_values = svm_predict(
    y_test, x_test_scaled, m_scaled)

Accuracy = 96.15% (3846/4000) (classification)


- Scaled sets with parameter selection

In [None]:
def compute_acc(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Sequence or array of reference labels.
        y_pred: Sequence or array of predicted labels, same shape as
            ``y_true``.

    Returns:
        The mean of the element-wise equality test, a value in [0, 1]
        (NaN for empty input, as ``np.mean`` defines it).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce first: comparing two plain lists with == yields a single bool
    # instead of an element-wise result.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)


# Metric name -> scoring function taking (y_true, y_pred).
evaluations = {
    "ACC": compute_acc
}

def grid_serach(y, x, n_folds=5,
                c_exp_space=(-10, 10, 5), g_exp_space=(-10, 10, 5),
                metric="ACC", seed=None):
    """Grid-search the ``-c`` and ``-g`` options of ``svm_train`` by k-fold CV.

    Every pair of exponents ``(c, g)`` on the grid is tried with the option
    string ``-c 2**c -g 2**g -q``. For each pair, each fold is predicted by a
    model trained on the remaining folds, and the pooled predictions are
    scored with ``evaluations[metric]`` (reported times 100). One progress
    line is printed per grid point and a summary line at the end.

    Args:
        y: Labels; converted to a NumPy array so they can be index-selected.
        x: Feature container that supports ``x[index_array]`` row selection.
        n_folds: Number of folds, between 2 and ``len(y)``.
        c_exp_space: ``(start, stop, num)`` passed to ``np.linspace`` to
            build the exponents for ``-c``.
        g_exp_space: Same as ``c_exp_space`` for ``-g``.
        metric: Key into the module-level ``evaluations`` registry.
        seed: Optional seed for the fold shuffle. ``None`` keeps the original
            behaviour of drawing from NumPy's global random state.

    Returns:
        The ``(c_exp, g_exp)`` exponent pair with the highest score; ties
        keep the earliest pair on the grid.

    Raises:
        KeyError: If ``metric`` is not in ``evaluations``.
        ValueError: If ``n_folds`` is out of range or the grid is empty.
    """
    # Resolve the metric first so a bad name fails before any training.
    score_fn = evaluations[metric]

    y = np.asarray(y)
    l = len(y)
    if not 2 <= n_folds <= l:
        raise ValueError(
            f"n_folds must be between 2 and len(y)={l}, got {n_folds}"
        )

    c_seq = np.linspace(*c_exp_space)
    g_seq = np.linspace(*g_exp_space)
    grid_space = [(c, g) for c in c_seq for g in g_seq]
    if not grid_space:
        raise ValueError("the (c, g) exponent grid is empty")

    # One shuffle shared by all grid points, so every parameter pair is
    # scored on the same folds.
    if seed is None:
        permutation = np.random.permutation(l)
    else:
        permutation = np.random.default_rng(seed).permutation(l)
    index_per_fold = [
        permutation[int(i * l / n_folds):int((i+1) * l / n_folds)]
        for i in range(n_folds)
    ]

    # The folds partition all indices, so y_pred is fully overwritten for
    # each grid point.
    y_pred = np.zeros(l)

    best_score = 0
    best_params = None
    best_params_info = ""

    for params in grid_space:
        options = f"-c {2**params[0]} -g {2**params[1]} -q"

        for i in range(n_folds):
            valid_index = index_per_fold[i]
            train_index = np.concatenate(
                index_per_fold[:i] + index_per_fold[i+1:]
                )

            m = svm_train(y[train_index], x[train_index], options)

            pred_labels, _, _ = svm_predict(
                y[valid_index], x[valid_index], m, "-q"
                )
            y_pred[valid_index] = np.array(pred_labels)

        cv_score = score_fn(y, y_pred) * 100
        # Always accept the first grid point: otherwise a run in which no
        # score exceeds 0 would end without any selected parameters.
        if best_params is None or cv_score > best_score:
            best_score = cv_score
            best_params = params
            best_params_info = \
                "(best c={0}, g={1}, rate={2:.4f})".format(
                    2**best_params[0], 2**best_params[1], best_score
                    )
        print("{0:>5} {1:>5} {2:.4f} {3}".format(
            *params, cv_score, best_params_info
        ))

    print("{0:>5} {1:>5} {2:.4f}".format(
        *best_params, best_score
    ))
    return best_params


# Correctly spelled alias; the original (misspelled) name stays the primary
# definition so existing calls keep working.
grid_search = grid_serach

# Search the default exponent grid on the scaled training set; the result is
# the (c, g) exponent pair, i.e. the actual options are 2**c and 2**g.
best_params = grid_serach(y_train, x_train_scaled)

-10.0 -10.0 64.7459 (best c=0.0009765625, g=0.0009765625, rate=64.7459)
-10.0  -5.0 64.7459 (best c=0.0009765625, g=0.0009765625, rate=64.7459)
-10.0   0.0 64.7459 (best c=0.0009765625, g=0.0009765625, rate=64.7459)
-10.0   5.0 64.7459 (best c=0.0009765625, g=0.0009765625, rate=64.7459)
-10.0  10.0 64.7459 (best c=0.0009765625, g=0.0009765625, rate=64.7459)
 -5.0 -10.0 64.7459 (best c=0.0009765625, g=0.0009765625, rate=64.7459)
 -5.0  -5.0 64.9077 (best c=0.03125, g=0.03125, rate=64.9077)
 -5.0   0.0 93.2988 (best c=0.03125, g=1.0, rate=93.2988)
 -5.0   5.0 86.9861 (best c=0.03125, g=1.0, rate=93.2988)
 -5.0  10.0 64.7459 (best c=0.03125, g=1.0, rate=93.2988)
  0.0 -10.0 65.0049 (best c=0.03125, g=1.0, rate=93.2988)
  0.0  -5.0 93.6225 (best c=1.0, g=0.03125, rate=93.6225)
  0.0   0.0 96.6656 (best c=1.0, g=1.0, rate=96.6656)
  0.0   5.0 95.8886 (best c=1.0, g=1.0, rate=96.6656)
  0.0  10.0 73.7132 (best c=1.0, g=1.0, rate=96.6656)
  5.0 -10.0 93.2988 (best c=1.0, g=1.0, rate=96.6656)


- Use the best parameters $C$ and $\gamma$ to train on the entire training set and evaluate the model on the test set.

In [8]:
# best_params holds exponents, so the values handed to LIBSVM are powers of 2.
options = "-c {} -g {} -q".format(2**best_params[0], 2**best_params[1])

# Retrain on the full scaled training set, then score the scaled test set.
m_scaled = svm_train(y_train, x_train_scaled, options)
pred_labels, pred_metrics, pred_values = svm_predict(
    y_test, x_test_scaled, m_scaled)

Accuracy = 96.85% (3874/4000) (classification)


### Bioinformatics

- Load data

In [5]:
# svmguide2 data: only a training file is loaded for this example.
svmguide2_train_url = (
    'https://www.csie.ntu.edu.tw/~cjlin/'
    'libsvmtools/datasets/multiclass/svmguide2'
)
y_train, x_train = load_url_libsvm(svmguide2_train_url)

- Original sets with default parameters

In [7]:
# "-v 5" asks LIBSVM for 5-fold cross-validation on the unscaled data; the
# accuracy is printed by the call itself.
# NOTE(review): per the LIBSVM Python interface docs, svm_train returns the
# cross-validation accuracy rather than a model in -v mode, so `m` is
# presumably a number here - confirm before reusing it as a model.
m = svm_train(y_train, x_train, "-v 5 -q")

Cross Validation Accuracy = 56.5217%


- Scaled sets with default parameters

In [9]:
%%capture
# Derive scaling parameters from the training set (target bounds lower=-1,
# upper=1) and apply them to that same set.
scale_param = csr_find_scale_param(x_train, lower=-1, upper=1)
x_train_scaled = csr_scale(x_train, scale_param)

In [10]:
# 5-fold cross-validation ("-v 5") on the scaled data, otherwise default
# options.
# NOTE(review): in -v mode svm_train presumably returns the CV accuracy, not
# a model - confirm against the LIBSVM Python interface docs.
m_scaled = svm_train(y_train, x_train_scaled, "-v 5 -q")

Cross Validation Accuracy = 79.7954%


- Scaled sets with parameter selection

In [11]:
# Search the default exponent grid on the scaled training set; the result is
# the (c, g) exponent pair, i.e. the actual options are 2**c and 2**g.
best_params = grid_serach(y_train, x_train_scaled)

-10.0 -10.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
-10.0  -5.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
-10.0   0.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
-10.0   5.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
-10.0  10.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
 -5.0 -10.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
 -5.0  -5.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
 -5.0   0.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
 -5.0   5.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
 -5.0  10.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
  0.0 -10.0 56.5217 (best c=0.0009765625, g=0.0009765625, rate=56.5217)
  0.0  -5.0 76.4706 (best c=1.0, g=0.03125, rate=76.4706)
  0.0   0.0 80.5627 (best c=1.0, g=1.0, rate=80.5627)
  0.0   5.0 56.5217 (best c=1.0, g=1.0, rate=80.5627)
  0.0  10.0 56.5217 (best c=1.0, g=1.0, rate=80.5627)

- Use the best parameters $C$ and $\gamma$ and evaluate them with five-fold cross-validation on the entire training set.

In [12]:
# best_params holds exponents, so the values handed to LIBSVM are powers of 2;
# "-v 5" makes this a five-fold cross-validation run.
options = "-c {} -g {} -v 5 -q".format(2**best_params[0], 2**best_params[1])
m_scaled = svm_train(y_train, x_train_scaled, options)

Cross Validation Accuracy = 83.6317%


### Vehicle

- Load data

In [14]:
# svmguide3 data: the training file and its ".t" test companion. The URLs
# get their own svmguide3_* names so they no longer overwrite the
# svmguide1_* variables defined for the Astroparticle example.
svmguide3_train_url = 'https://www.csie.ntu.edu.tw/~cjlin/\
libsvmtools/datasets/binary/svmguide3'
svmguide3_test_url = 'https://www.csie.ntu.edu.tw/~cjlin/\
libsvmtools/datasets/binary/svmguide3.t'
y_train, x_train = load_url_libsvm(svmguide3_train_url)
y_test, x_test = load_url_libsvm(svmguide3_test_url)

- Original sets with default parameters

In [15]:
# Baseline: train on the unscaled data with no options other than "-q"
# (LIBSVM's quiet flag), then predict the unscaled test set.
m = svm_train(y_train, x_train, "-q")
pred_labels, pred_metrics, pred_values = svm_predict(y_test, x_test, m)

Accuracy = 2.43902% (1/41) (classification)


- Scaled sets with default parameters

In [16]:
%%capture
# Derive the scaling parameters from the training set only, with target
# bounds lower=-1 and upper=1.
scale_param = csr_find_scale_param(x_train, lower=-1, upper=1)
x_train_scaled = csr_scale(x_train, scale_param)
# The test set is scaled with the parameters found on the training set.
x_test_scaled = csr_scale(x_test, scale_param)

In [17]:
# Same options as the baseline, but trained and evaluated on the scaled
# features.
m_scaled = svm_train(y_train, x_train_scaled, "-q")
pred_labels, pred_metrics, pred_values = svm_predict(
    y_test, x_test_scaled, m_scaled)

Accuracy = 12.1951% (5/41) (classification)


- Scaled sets with parameter selection

In [18]:
# Search the default exponent grid on the scaled training set; the result is
# the (c, g) exponent pair, i.e. the actual options are 2**c and 2**g.
best_params = grid_serach(y_train, x_train_scaled)

-10.0 -10.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
-10.0  -5.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
-10.0   0.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
-10.0   5.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
-10.0  10.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
 -5.0 -10.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
 -5.0  -5.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
 -5.0   0.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
 -5.0   5.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
 -5.0  10.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
  0.0 -10.0 76.1866 (best c=0.0009765625, g=0.0009765625, rate=76.1866)
  0.0  -5.0 76.9107 (best c=1.0, g=0.03125, rate=76.9107)
  0.0   0.0 81.0137 (best c=1.0, g=1.0, rate=81.0137)
  0.0   5.0 76.1062 (best c=1.0, g=1.0, rate=81.0137)
  0.0  10.0 76.1866 (best c=1.0, g=1.0, rate=81.0137)

- Use the best parameters $C$ and $\gamma$ to train on the entire training set and evaluate the model on the test set.

In [19]:
# best_params holds exponents, so the values handed to LIBSVM are powers of 2.
options = "-c {} -g {} -q".format(2**best_params[0], 2**best_params[1])

# Retrain on the full scaled training set, then score the scaled test set.
m_scaled = svm_train(y_train, x_train_scaled, options)
pred_labels, pred_metrics, pred_values = svm_predict(
    y_test, x_test_scaled, m_scaled)

Accuracy = 82.9268% (34/41) (classification)
