In [31]:
import pandas as pd
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import time

In [2]:
data = pd.read_csv('data.csv')
data = data.drop('Unnamed: 32', 1)
# data

In [3]:
#Simple statistics on dataset
# print(len(data[data.diagnosis=='M']))
# print(len(data[data.diagnosis=='B']))
# print(data.columns)
# data.describe()

In [4]:
X = data.drop(['id', 'diagnosis'], 1)
Y = data.diagnosis
print(X.columns)

x_in, x_out, y_in, y_out = train_test_split(X, Y, train_size=.75)

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')


In [5]:
#Simple Support Vector Machine. No parameter tuning and no cross validation
svc_model = SVC()
svc_model.fit(x_in, y_in)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [6]:
svc_model.score(x_out, y_out)

0.58741258741258739

# Parameter Optimization

## Grid search

In [22]:
#Grid search
svc_parameters = {'C': np.arange(.5, 1.5, .1),
 'cache_size': [200],
 'class_weight': [None],
 'coef0': [0.0],
 'decision_function_shape': [None],
 'degree': [3],
 'gamma': ['auto'],
#  'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
 'kernel': ['linear'],
 'max_iter': [-1],
 'probability': [False],
 'random_state': [None],
 'shrinking': [True],
 'tol': [0.001],
 'verbose': [False]}
svc_model = SVC()
gs = GridSearchCV(svc_model, svc_parameters, cv=3)


In [23]:
gs.fit(X, Y)

GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([ 0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,  1.2,  1.3,  1.4]), 'cache_size': [200], 'class_weight': [None], 'coef0': [0.0], 'decision_function_shape': [None], 'degree': [3], 'gamma': ['auto'], 'kernel': ['linear'], 'max_iter': [-1], 'probability': [False], 'random_state': [None], 'shrinking': [True], 'tol': [0.001], 'verbose': [False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [24]:
print(gs.best_score_)
print(gs.best_params_)

0.954305799649
{'C': 1.2999999999999998, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': None, 'degree': 3, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


## Randomized Search

In [27]:
rs = RandomizedSearchCV(svc_model, svc_parameters, cv=3)

In [29]:
rs.fit(X, Y)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': array([ 0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,  1.2,  1.3,  1.4]), 'cache_size': [200], 'class_weight': [None], 'coef0': [0.0], 'decision_function_shape': [None], 'degree': [3], 'gamma': ['auto'], 'kernel': ['linear'], 'max_iter': [-1], 'probability': [False], 'random_state': [None], 'shrinking': [True], 'tol': [0.001], 'verbose': [False]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [30]:
print(rs.best_score_)
print(rs.best_params_)

0.954305799649
{'verbose': False, 'tol': 0.001, 'shrinking': True, 'random_state': None, 'probability': False, 'max_iter': -1, 'kernel': 'linear', 'gamma': 'auto', 'degree': 3, 'decision_function_shape': None, 'coef0': 0.0, 'class_weight': None, 'cache_size': 200, 'C': 1.2999999999999998}
