In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn import preprocessing as pp
from sklearn import model_selection as ms
from sklearn.svm import SVC

Apply scikit-learn's SVM classifier with both an RBF and a polynomial kernel to the (`scikit-learn`) breast cancer dataset by:

1. load and look at the data

2. normalize the data so that all features have mean 0 and variance 1 (this is easy with scikit-learn's preprocessing package)

3. apply SVC with an RBF kernel.  Try all values of $C$ and $\gamma$ in the set $\{2^k \mid k = -5, \dots, 10\}$ (16 values each, so $16 \times 16 = 256$ total pairs $(C,\gamma)$) and for each of these, do 3-fold cross validation.  Identify which pair gives the best cross-validation score.
 
4. Repeat #3 with a polynomial kernel.  Now you also need to search over possible choices of degree (say, 1...5) and `coef0` (say 0, 1, -1), for a total of 256 x 5 x 3 different models.

### 1)

In [2]:
# Load the bundled breast-cancer dataset, then pull out the feature matrix
# and the matching label vector from the returned container.
cancer = datasets.load_breast_cancer()
data, target = cancer['data'], cancer['target']

The first three rows of data:

In [3]:
# Peek at the first three samples (all feature columns).
print(data[:3, :])

[[  1.79900000e+01   1.03800000e+01   1.22800000e+02   1.00100000e+03
    1.18400000e-01   2.77600000e-01   3.00100000e-01   1.47100000e-01
    2.41900000e-01   7.87100000e-02   1.09500000e+00   9.05300000e-01
    8.58900000e+00   1.53400000e+02   6.39900000e-03   4.90400000e-02
    5.37300000e-02   1.58700000e-02   3.00300000e-02   6.19300000e-03
    2.53800000e+01   1.73300000e+01   1.84600000e+02   2.01900000e+03
    1.62200000e-01   6.65600000e-01   7.11900000e-01   2.65400000e-01
    4.60100000e-01   1.18900000e-01]
 [  2.05700000e+01   1.77700000e+01   1.32900000e+02   1.32600000e+03
    8.47400000e-02   7.86400000e-02   8.69000000e-02   7.01700000e-02
    1.81200000e-01   5.66700000e-02   5.43500000e-01   7.33900000e-01
    3.39800000e+00   7.40800000e+01   5.22500000e-03   1.30800000e-02
    1.86000000e-02   1.34000000e-02   1.38900000e-02   3.53200000e-03
    2.49900000e+01   2.34100000e+01   1.58800000e+02   1.95600000e+03
    1.23800000e-01   1.86600000e-01   2.41600000e-01 

The first three corresponding entries of target values:

In [4]:
# Labels belonging to the three samples shown above.
print(target[0:3])

[0 0 0]


### 2)

In [5]:
# Standardize every feature (column) to zero mean and unit variance.
# pp.scale operates column-wise on a 2-D array, so a single call replaces
# the explicit per-column loop.  Binding the result to `data` (instead of
# writing into the array in place) leaves the raw values held by `cancer`
# untouched, so this cell can be re-run safely.
data = pp.scale(data)

### 3)

In [6]:
# Exhaustive grid search for the RBF kernel: every (C, gamma) pair drawn
# from {2**k | k = -5..10} is scored by 3-fold cross-validation.
# Each entry of `scores` is [C, gamma, mean cross-validation score].
scores = []
parameter_values = [2**k for k in range(-5, 11)]
for c in parameter_values:
    for Gamma in parameter_values:
        svc = SVC(C=c, kernel="rbf", gamma=Gamma)
        # Ask for 3 folds explicitly rather than relying on the default of
        # cross_val_score, which is not the same across scikit-learn releases.
        fold_scores = ms.cross_val_score(svc, data, target, cv=3)
        scores.append([c, Gamma, fold_scores.mean()])

In [7]:
# Order the [C, gamma, mean score] rows from best to worst mean score,
# then report the top row.
scores = sorted(scores, key=lambda row: row[2], reverse=True)
rbf_report = "Best C:\t\t\t{}\nBest gamma:\t\t{}\nBest mean accuracy:\t{}"
print(rbf_report.format(*scores[0]))

Best C:			4
Best gamma:		0.03125
Best mean accuracy:	0.978919521025


###  4)

In [8]:
# Exhaustive grid search for the polynomial kernel over C, gamma, degree
# (1..5) and coef0 (-1, 0, 1), each combination scored by 3-fold
# cross-validation.
# Each entry of `scores2` is [C, gamma, degree, coef0, mean score].
scores2 = []
# NOTE(review): this grid covers 2**-5 .. 2**4 only, which is narrower than
# the 2**-5 .. 2**10 range used for the RBF search; presumably a deliberate
# cut to keep the run time manageable -- confirm.
parameter_values = [2**k for k in range(-5, 5)]
for c in parameter_values:
    for Gamma in parameter_values:
        for Degree in range(1, 6):
            for coef in range(-1, 2):
                svc = SVC(C=c, kernel="poly", gamma=Gamma, degree=Degree, coef0=coef)
                # Ask for 3 folds explicitly rather than relying on the default
                # of cross_val_score, which is not the same across
                # scikit-learn releases.
                fold_scores = ms.cross_val_score(svc, data, target, cv=3)
                scores2.append([c, Gamma, Degree, coef, fold_scores.mean()])

In [9]:
# Order the [C, gamma, degree, coef0, mean score] rows from best to worst
# mean score, then report the top row.
scores2 = sorted(scores2, key=lambda row: row[4], reverse=True)
poly_report = (
    "Best C:\t\t\t{}\n"
    "Best gamma:\t\t{}\n"
    "Best degree:\t\t{}\n"
    "Best Coef Value:\t{}\n"
    "Best mean accuracy:\t{}"
)
print(poly_report.format(*scores2[0]))

Best C:			0.125
Best gamma:		0.25
Best degree:		2
Best Coef Value:	1
Best mean accuracy:	0.980683189455
