In [40]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

Load the data

In [41]:
# Locations of the training and test files, relative to the notebook.
trainfile = 'data/sat.trn'
testfile = 'data/sat.tst'

# Column labels passed to read_csv: integers 0..35 for the 36 leading
# columns, followed by 'terrain' for the final column.
colnames = list(range(36))
colnames.append('terrain')

# The separator is a regex matching any run of whitespace. It must be a raw
# string: in a normal string literal '\s' is an invalid escape sequence, which
# newer Python versions warn about and will eventually reject.
train = pd.read_csv(trainfile, sep=r'\s+', names=colnames)
test = pd.read_csv(testfile, sep=r'\s+', names=colnames)

In [42]:
# Preview the first rows of the training frame to check the columns parsed.
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,terrain
0,92,115,120,94,84,102,106,79,84,102,...,104,88,121,128,100,84,107,113,87,3
1,84,102,106,79,84,102,102,83,80,102,...,100,84,107,113,87,84,99,104,79,3
2,84,102,102,83,80,102,102,79,84,94,...,87,84,99,104,79,84,99,104,79,3
3,80,102,102,79,84,94,102,79,80,94,...,79,84,99,104,79,84,103,104,79,3
4,84,94,102,79,80,94,98,76,80,102,...,79,84,103,104,79,79,107,109,87,3


In [43]:
# Preview the first rows of the test frame; it uses the same column labels.
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,terrain
0,80,102,102,79,76,102,102,79,76,102,...,87,79,107,109,87,79,107,113,87,3
1,76,102,102,79,76,102,106,83,76,102,...,87,79,107,113,87,79,103,104,83,3
2,80,98,106,79,76,94,102,76,76,94,...,79,79,95,100,79,79,95,96,75,4
3,76,94,102,76,76,94,102,76,76,94,...,79,79,95,96,75,79,95,100,75,4
4,76,94,102,76,76,94,102,76,76,89,...,75,79,95,100,75,75,95,100,79,4


In [44]:
# Split the column labels: everything except the last column is a feature,
# and the last column holds the label to predict.
xcols, ycol = train.columns[:-1], train.columns[-1]

In [45]:
# Cast the feature columns of both frames to float; the label column is
# left untouched.
for frame in (train, test):
    frame[xcols] = frame[xcols].astype(float)

In [46]:
# Two-step model: standardize the features, then classify with an SVC.
# The step names ('scale', 'svc') are the prefixes used to address
# hyper-parameters in the grid-search dictionaries (e.g. 'svc__C').
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('svc', SVC()),
])

In [47]:
# Coarse search space for the 'svc' pipeline step: four gamma values one
# decade apart, and seven C values log-spaced from 1e-3 to 1e3.
coarse_params = dict(
    svc__gamma=[0.01, 0.1, 1.0, 10.0],
    svc__C=np.logspace(start=-3, stop=3, num=7),
)

In [48]:
# Exhaustive search over coarse_params, scoring each combination with
# 5-fold cross-validation of the scale+SVC pipeline.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=coarse_params,
    cv=5,
)

In [49]:
%%time
# Run the coarse grid search on the training frame. The cell magic above must
# stay on the first line of the cell; it reports how long the fit took.
# The label column is passed as a flat NumPy array.
gs.fit(train[xcols], train[ycol].values.ravel())

CPU times: user 3min 21s, sys: 512 ms, total: 3min 22s
Wall time: 3min 22s


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'svc__gamma': [0.01, 0.1, 1.0, 10.0], 'svc__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [50]:
# Best (C, gamma) combination found by the coarse search; used to centre the
# finer grid defined next.
gs.best_params_

{'svc__C': 1.0, 'svc__gamma': 0.1}

In [51]:
# Finer search space around the coarse optimum: ten log-spaced values per
# axis, C from 1e-1 to 1e2 and gamma from 1e-2 to 1e0.
fine_params = dict(
    svc__C=np.logspace(start=-1, stop=2, num=10),
    svc__gamma=np.logspace(start=-2, stop=0, num=10),
)

In [52]:
%%time
# Second, finer grid search: same pipeline and 5-fold CV, but over
# fine_params. The cell magic above must remain the first line of the cell.
gs2 = GridSearchCV(pipeline, param_grid=fine_params, cv=5)
gs2.fit(train[xcols], train[ycol].values.ravel())

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.82 µs
CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs


In [53]:
# Best (C, gamma) combination found by the fine search.
gs2.best_params_

{'svc__C': 2.1544346900318834, 'svc__gamma': 0.0774263682681127}

In [54]:
# Score the fitted fine search on the separate test file. No `scoring` was
# passed to GridSearchCV, so the estimator's own default score is reported.
gs2.score(test[xcols], test[ycol].values.ravel())

0.914

In [55]:
# Print the C grid used in the fine search, to read off the values on either
# side of the best C for the next, narrower search.
np.logspace(-1, 2, 10)

array([  0.1       ,   0.21544347,   0.46415888,   1.        ,
         2.15443469,   4.64158883,  10.        ,  21.5443469 ,
        46.41588834, 100.        ])

In [None]:
# Third, linear zoom around the fine-search optimum
# (C ~= 2.154, gamma ~= 0.0774). Each range runs between the two fine-grid
# values adjacent to the winner:
#   C:     np.logspace(-1, 2, 10)[3] .. [5]  ->  1.0        .. 4.64158883
#   gamma: np.logspace(-2, 0, 10)[3] .. [5]  ->  0.04641589 .. 0.12915497
# num is set explicitly: np.linspace defaults to 50 points, which would give
# 50 * 50 combinations, each fitted once per CV fold.
very_fine_params = {
    'svc__C': np.linspace(1.0, 4.64158883, 10),
    'svc__gamma': np.linspace(0.04641589, 0.12915497, 10),
}