# Tuning Parameters of SVM with GridSearchCV

In [1]:
#Import scikit-learn dataset library
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm

import numpy as np
import pandas as pd



## Dataset

In [2]:
# Load the breast-cancer dataset bundled with scikit-learn and print its
# built-in description.
cancer = datasets.load_breast_cancer()
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [3]:
# Wrap the raw arrays in pandas containers: a feature frame whose columns
# carry the dataset's feature names, and a target series named 'target'.
feature_names = cancer['feature_names']
X = pd.DataFrame(cancer['data'], columns=feature_names)
y = pd.Series(cancer['target'], name='target')

## Modeling

In [4]:
# Hold out 30% of the samples as a test set; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=109,
)

In [5]:
# Baseline model: a support-vector classifier with a linear kernel, fitted
# on the training split. fit() hands back the estimator itself, so the
# construction and the fit can be chained.
clf = svm.SVC(kernel='linear').fit(X_train, y_train)
clf

SVC(kernel='linear')

In [6]:
# Sanity check on the training split: the classifier's score together with
# the confusion matrix of its predictions on the same data.
y_pred = clf.predict(X_train)
train_score = clf.score(X_train, y_train)
train_confusion = metrics.confusion_matrix(y_train, y_pred)
train_score, train_confusion

(0.9623115577889447,
 array([[139,  10],
        [  5, 244]]))

In [7]:
# Evaluate on the held-out test split; from here on y_pred holds the
# predictions for X_test.
y_pred = clf.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.9649122807017544


In [8]:
# Text report of per-class metrics for the test-set predictions.
report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95        63
           1       0.98      0.96      0.97       108

    accuracy                           0.96       171
   macro avg       0.96      0.97      0.96       171
weighted avg       0.97      0.96      0.97       171



## Tuning Parameters

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
# Candidate values for each SVC hyper-parameter explored by the grid search.

# Regularization parameter: how much error we tolerate in the model.
# Controls the trade-off between the decision boundary and
# misclassifications.
c_values = [0.1, 1, 10, 100]

# Influence of individual points on the hyper-plane: a high gamma
# considers only the nearest points, a low gamma considers farther
# points as well.
# NOTE(review): per the scikit-learn SVC docs, gamma is the coefficient
# for the 'rbf', 'poly' and 'sigmoid' kernels only, so each 'linear'
# candidate is presumably re-evaluated unchanged for every gamma value.
# Confirm, and consider a list of per-kernel grids if search time matters.
gamma_values = [1, 0.1, 0.01]

# Kernel function that maps the data from a lower- to a higher-dimensional
# space, trying to make it linearly separable.
kernels = ['rbf', 'sigmoid', 'linear']

param_grid = {
    'C': c_values,
    'gamma': gamma_values,
    'kernel': kernels,
}


In [11]:
# Exhaustive search over param_grid using 2-fold cross-validation on the
# training split. Per the scikit-learn docs, refit=True retrains the best
# candidate on the full training data and n_jobs=-1 uses all processors.
search_options = dict(cv=2, refit=True, n_jobs=-1, verbose=2)
grid = GridSearchCV(svm.SVC(), param_grid, **search_options)
grid.fit(X_train, y_train)

Fitting 2 folds for each of 36 candidates, totalling 72 fits


GridSearchCV(cv=2, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01],
                         'kernel': ['rbf', 'sigmoid', 'linear']},
             verbose=2)

In [12]:
# Show the configuration the grid search selected.
best_model = grid.best_estimator_
print(best_model)

SVC(C=100, gamma=1, kernel='linear')


In [13]:
# Score of the selected estimator on the held-out test split, for
# comparison with the untuned linear baseline above.
tuned_clf = grid.best_estimator_
tuned_clf.score(X_test, y_test)

0.9707602339181286