# Compare SVC gridsearch with and without probability parameter
Based on the built-in scikit-learn breast cancer classification dataset.

In [92]:
from __future__ import division
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import make_scorer, roc_auc_score

In [93]:
# Show the NumPy version this notebook was run with.
np.__version__

'1.14.5'

### Fit gridsearches
We want to see whether setting `probability=True` or `probability=False` affects the grid search results for `SVC`.

In [94]:
# Load the breast cancer data as a (features, labels) pair of arrays.
X, y = load_breast_cancer(return_X_y=True)

# Shuffled split, stratified on the labels, seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    shuffle=True,
    stratify=y,
    random_state=np.random.RandomState(2))
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((426L, 30L), (143L, 30L), (426L,), (143L,))


Using `probability=True` and scoring using `roc_auc`:

In [95]:
# Grid search over C and gamma for an RBF-kernel SVC built with
# probability=True, ranked with the built-in 'roc_auc' scoring string.
gs_t_ra = GridSearchCV(
    estimator=SVC(kernel='rbf', probability=True,
                  random_state=np.random.RandomState(0)),
    param_grid={'C': [1, 10], 'gamma': [0.001, 0.01]},
    # 3-fold stratified, shuffled cross-validation with a fixed seed.
    cv=StratifiedKFold(n_splits=3, shuffle=True,
                       random_state=np.random.RandomState(1)),
    scoring='roc_auc')
gs_t_ra.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print('Best params: ' + str(gs_t_ra.best_params_))
print('Best score: ' + str(gs_t_ra.best_score_))

Best params: {'C': 1, 'gamma': 0.001}
Best score: 0.9515935269592254


Using `probability=False` and scoring using `roc_auc`:

In [96]:
# Same grid search as the probability=True / 'roc_auc' case, but with the
# SVC built with probability=False.
gs_f_ra = GridSearchCV(
    estimator=SVC(kernel='rbf', probability=False,
                  random_state=np.random.RandomState(0)),
    param_grid={'C': [1, 10], 'gamma': [0.001, 0.01]},
    # 3-fold stratified, shuffled cross-validation with a fixed seed.
    cv=StratifiedKFold(n_splits=3, shuffle=True,
                       random_state=np.random.RandomState(1)),
    scoring='roc_auc')
gs_f_ra.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print('Best params: ' + str(gs_f_ra.best_params_))
print('Best score: ' + str(gs_f_ra.best_score_))

Best params: {'C': 1, 'gamma': 0.001}
Best score: 0.9515935269592254


Using `probability=True` and scoring using `make_scorer(roc_auc_score)`:

In [97]:
# Grid search over C and gamma for an RBF-kernel SVC built with
# probability=True, ranked with a scorer made from roc_auc_score.
# NOTE(review): make_scorer is called with its defaults (no needs_threshold /
# needs_proba), so this scorer presumably receives predict() labels rather
# than decision_function values -- confirm against the scikit-learn docs.
gs_t_ms = GridSearchCV(
    estimator=SVC(kernel='rbf', probability=True,
                  random_state=np.random.RandomState(0)),
    param_grid={'C': [1, 10], 'gamma': [0.001, 0.01]},
    # 3-fold stratified, shuffled cross-validation with a fixed seed.
    cv=StratifiedKFold(n_splits=3, shuffle=True,
                       random_state=np.random.RandomState(1)),
    scoring=make_scorer(roc_auc_score))
gs_t_ms.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print('Best params: ' + str(gs_t_ms.best_params_))
print('Best score: ' + str(gs_t_ms.best_score_))

Best params: {'C': 10, 'gamma': 0.001}
Best score: 0.9191929898947071


Using `probability=False` and scoring using `make_scorer(roc_auc_score)`:

In [98]:
# Same grid search as the probability=True / make_scorer case, but with the
# SVC built with probability=False.
# NOTE(review): make_scorer is called with its defaults (no needs_threshold /
# needs_proba), so this scorer presumably receives predict() labels rather
# than decision_function values -- confirm against the scikit-learn docs.
gs_f_ms = GridSearchCV(
    estimator=SVC(kernel='rbf', probability=False,
                  random_state=np.random.RandomState(0)),
    param_grid={'C': [1, 10], 'gamma': [0.001, 0.01]},
    # 3-fold stratified, shuffled cross-validation with a fixed seed.
    cv=StratifiedKFold(n_splits=3, shuffle=True,
                       random_state=np.random.RandomState(1)),
    scoring=make_scorer(roc_auc_score))
gs_f_ms.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
print('Best params: ' + str(gs_f_ms.best_params_))
print('Best score: ' + str(gs_f_ms.best_score_))

Best params: {'C': 10, 'gamma': 0.001}
Best score: 0.9191929898947071


### Interrogate fitted models
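Setting `probability` to `True` or `False` does not affect the score produced by the model. However, the choice of `scoring` does affect the score produced. We conclude that the grid search is optimising over a different quantity in each case: `scoring='roc_auc'` computes the AUROC from the continuous `decision_function` values, whereas `make_scorer(roc_auc_score)` with its default arguments computes it from the hard class labels returned by `predict`.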

In [99]:
# First five test-set outputs of the probability=True / 'roc_auc' search:
# predicted labels, then decision-function values.
print(gs_t_ra.predict(X_test)[:5])
print(gs_t_ra.decision_function(X_test)[:5])
# predict_proba may be unavailable; report that instead of failing the cell.
try:
    print(gs_t_ra.predict_proba(X_test)[:5, 1])  # column 1 of the probabilities
except AttributeError:
    print('Cannot run predict_proba')

[1 0 0 1 0]
[ 1.13762538 -0.7101496  -0.6853815   0.86097811 -0.9264004 ]
[0.98396355 0.06502408 0.07077458 0.95697885 0.03130039]


In [100]:
# First five test-set outputs of the probability=False / 'roc_auc' search:
# predicted labels, then decision-function values.
print(gs_f_ra.predict(X_test)[:5])
print(gs_f_ra.decision_function(X_test)[:5])
# predict_proba may be unavailable; report that instead of failing the cell.
try:
    print(gs_f_ra.predict_proba(X_test)[:5, 1])  # column 1 of the probabilities
except AttributeError:
    print('Cannot run predict_proba')

[1 0 0 1 0]
[ 1.13762538 -0.7101496  -0.6853815   0.86097811 -0.9264004 ]
Cannot run predict_proba


In [101]:
# First five test-set outputs of the probability=True / make_scorer search:
# predicted labels, then decision-function values.
print(gs_t_ms.predict(X_test)[:5])
print(gs_t_ms.decision_function(X_test)[:5])
# predict_proba may be unavailable; report that instead of failing the cell.
try:
    print(gs_t_ms.predict_proba(X_test)[:5, 1])  # column 1 of the probabilities
except AttributeError:
    print('Cannot run predict_proba')

[1 0 0 1 0]
[ 1.10563526 -0.6906429  -0.98790694  1.0772863  -0.68790442]
[0.9696467  0.09933435 0.04280473 0.96690312 0.10011013]


In [102]:
# First five test-set outputs of the probability=False / make_scorer search:
# predicted labels, then decision-function values.
print(gs_f_ms.predict(X_test)[:5])
print(gs_f_ms.decision_function(X_test)[:5])
# predict_proba may be unavailable; report that instead of failing the cell.
try:
    print(gs_f_ms.predict_proba(X_test)[:5, 1])  # column 1 of the probabilities
except AttributeError:
    print('Cannot run predict_proba')

[1 0 0 1 0]
[ 1.10563526 -0.6906429  -0.98790694  1.0772863  -0.68790442]
Cannot run predict_proba


### Conclusion
Whether `SVC` can be tuned correctly using AUROC does not depend on its `probability` argument.

The `decision_function()` method of `SVC` returns the same values regardless of the `probability` setting.