In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import tabpy_client
from sklearn.datasets import load_breast_cancer

In [2]:
# Load the Wisconsin breast cancer dataset that ships with scikit-learn.
data = load_breast_cancer()

X = data.data                 # feature matrix
y = data.target               # integer class codes; code i stands for data.target_names[i]
columns = data.feature_names  # feature names, aligned with the columns of X

# Decoder from the integer codes in `y` back to class names.
# LabelEncoder.fit() stores its classes in sorted order (['benign', 'malignant']), which
# does not match the dataset's own coding (0 -> 'malignant', 1 -> 'benign'), so fitting on
# target_names would make inverse_transform() return swapped diagnoses. Pin classes_ to the
# dataset order instead so that inverse_transform(code) == data.target_names[code].
encoder = preprocessing.LabelEncoder()
encoder.classes_ = np.asarray(data.target_names)

encoder

LabelEncoder()

In [3]:
# Baseline linear SVM used as the ranking model for recursive feature elimination (RFE).
params = {'C': 1.0, 'kernel': 'linear', 'probability': True, 'random_state': 12345}

# Always start from the untouched feature matrix (data.data) instead of X: X is overwritten
# further down with the reduced matrix, so reading X here would make a second run of this
# cell operate on the already-reduced data and fail when indexing `columns`.
scaler = StandardScaler()
scaler.fit(data.data)

X_scaled = scaler.transform(data.data)

estimator = SVC()

estimator.set_params(**params)

# Keep the two features ranked best by the linear SVM.
# NOTE: the selector is fit on every sample, so the hold-out scores reported later in this
# notebook are computed on rows that already influenced which features were kept.
selector = RFE(estimator=estimator,n_features_to_select=2)

selector.fit(X_scaled,y)

# Reduce both the scaled and the raw matrix to the selected columns; from here on X holds
# only the kept features (unscaled).
X_scaled = selector.transform(X_scaled)
X = selector.transform(data.data)

# The estimator fitted by RFE on the selected (scaled) features.
estimator = selector.estimator_

print("Accuracy score (train): {0:.3f}".format(estimator.score(X_scaled, y)))

roc_auc = roc_auc_score(y, estimator.predict_proba(X_scaled)[:,1])

print("ROC AUC (train): {0:.3f}".format(roc_auc))

features_kept = columns[selector.support_]

print(features_kept)

Accuracy score (train): 0.949
ROC AUC (train): 0.985
['mean concavity' 'worst area']


In [4]:
# Export the selected features plus the decoded class label.
# Build the frame from the numeric matrix first and add the label column afterwards:
# concatenating the float features with the string labels in one NumPy array would coerce
# every value to a string and leave the DataFrame without numeric columns.
df = pd.DataFrame(X, columns=features_kept.tolist())
df['Class'] = encoder.inverse_transform(y)
df.to_csv('./Data.csv',index=False)

In [5]:
def _print_metric_summary(label, train_scores, val_scores):
    """Print mean, standard deviation and the 2.5/97.5 percentiles of per-trial scores."""
    print("Average {0} (train): {1:.3f}".format(label, np.mean(train_scores)))
    print("Average {0} (validation): {1:.3f}".format(label, np.mean(val_scores)))

    print("Standard deviation of {0} (train): {1:.3f}".format(label, np.std(train_scores)))
    print("Standard deviation of {0} (validation): {1:.3f}".format(label, np.std(val_scores)))

    # The reported interval is the empirical 2.5th-97.5th percentile range over the trials.
    print("95% CI of {0} (train): ".format(label), np.percentile(train_scores, (2.5, 97.5)))
    print("95% CI of {0} (validation): ".format(label), np.percentile(val_scores, (2.5, 97.5)))


def ModelSelectionNestedCV(estimator,params,X,y,NUM_TRIALS=100,random_state=None):
    """Repeat a stratified hold-out split with an inner grid search and report the scores.

    Each trial draws a stratified 70/30 train/validation split, standardises both parts with
    a scaler fitted on the training part only, tunes `estimator` over `params` with a
    5-fold stratified grid search (scored by accuracy, refit on the training part), and then
    scores the refit model on the training and validation parts. Per-trial results are
    printed, followed by a summary over all trials.

    Parameters
    ----------
    estimator : scikit-learn classifier
        Base model handed to GridSearchCV. The tuned model must provide `predict_proba`;
        column 1 of its output is used for ROC AUC, so a two-class problem is assumed.
    params : dict
        Parameter grid passed to GridSearchCV as `param_grid`.
    X, y : array-like
        Features and labels to split.
    NUM_TRIALS : int, default 100
        Number of independent split/tune/score repetitions.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator driving both the hold-out split and the fold shuffling. With
        None (the default) the splits are unseeded, as before; with a seed the whole
        sequence of trials is reproducible while the individual trials still differ.

    Returns
    -------
    tuple
        (best_estimators, best_params, cum_train_aucc, cum_val_aucc, cum_train_auc,
        cum_val_auc): six lists with one entry per trial holding the refit model, its
        parameters, train/validation accuracy and train/validation ROC AUC.
    """
    # One shared generator is consumed across trials, so a fixed seed reproduces the run
    # without making every trial use the same split.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    cum_train_aucc = []
    cum_val_aucc = []
    cum_train_auc = []
    cum_val_auc = []

    best_estimators = []
    best_params = []

    for _ in range(NUM_TRIALS):

        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.3, stratify=y, shuffle=True, random_state=rng)

        # Fit the scaler on the training part only so the validation part stays unseen.
        trial_scaler = StandardScaler()
        trial_scaler.fit(X_train)

        X_train = trial_scaler.transform(X_train)
        X_val = trial_scaler.transform(X_val)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng)

        clf = GridSearchCV(estimator=estimator,param_grid=params,scoring='accuracy',n_jobs=-1,cv=skf,refit=True,verbose=0)

        clf.fit(X_train,y_train)

        # Use a separate name: the `estimator` argument stays the caller's base model for
        # every trial instead of being replaced by the previous trial's winner.
        best_model = clf.best_estimator_

        best_estimators.append(best_model)

        print(clf.best_params_)

        best_params.append(clf.best_params_)

        # Score once and reuse the values for both the printout and the accumulators.
        train_acc = best_model.score(X_train, y_train)
        val_acc = best_model.score(X_val, y_val)

        print("Accuracy score (train): {0:.3f}".format(train_acc))
        print("Accuracy score (validation): {0:.3f}".format(val_acc))

        train_auc = roc_auc_score(y_train, best_model.predict_proba(X_train)[:,1])
        val_auc = roc_auc_score(y_val, best_model.predict_proba(X_val)[:,1])

        print("ROC AUC (train): {0:.3f}".format(train_auc))
        print("ROC AUC (validation): {0:.3f}".format(val_auc))

        cum_train_aucc.append(train_acc)
        cum_val_aucc.append(val_acc)

        cum_train_auc.append(train_auc)
        cum_val_auc.append(val_auc)

    _print_metric_summary("accuracy score", cum_train_aucc, cum_val_aucc)
    _print_metric_summary("ROC AUC", cum_train_auc, cum_val_auc)

    return best_estimators, best_params, cum_train_aucc, cum_val_aucc, cum_train_auc, cum_val_auc

In [6]:
# Search grid: 1000 evenly spaced C values in [1e-4, 1] for a linear-kernel SVM with
# probability estimates enabled and a fixed model seed.
params = {
    'C': np.linspace(1e-4,1,1000).tolist(),
    'kernel': ['linear'],
    'probability': [True],
    'random_state': [12345],
}

estimator = SVC()

# Run the repeated split/tune/score procedure on the reduced feature matrix and keep the
# per-trial models, parameters and scores.
(best_estimators,
 best_params,
 cum_train_aucc,
 cum_val_aucc,
 cum_train_auc,
 cum_val_auc) = ModelSelectionNestedCV(estimator,params,X,y)

{'C': 0.16224594594594594, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.937
Accuracy score (validation): 0.965
ROC AUC (train): 0.984
ROC AUC (validation): 0.985
{'C': 0.5636072072072071, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.953
ROC AUC (train): 0.985
ROC AUC (validation): 0.985
{'C': 0.4705234234234234, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.947
ROC AUC (train): 0.983
ROC AUC (validation): 0.991
{'C': 0.3243918918918919, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.952
Accuracy score (validation): 0.942
ROC AUC (train): 0.982
ROC AUC (validation): 0.990
{'C': 0.10119099099099099, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.937
Accuracy score (validation): 0.918
ROC AUC (train): 0.



{'C': 0.19127207207207206, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.965
ROC AUC (train): 0.981
ROC AUC (validation): 0.993
{'C': 0.6136522522522522, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.955
Accuracy score (validation): 0.930
ROC AUC (train): 0.989
ROC AUC (validation): 0.977




{'C': 0.17125405405405403, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.950
Accuracy score (validation): 0.942
ROC AUC (train): 0.984
ROC AUC (validation): 0.982
{'C': 0.8178360360360359, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.967
Accuracy score (validation): 0.918
ROC AUC (train): 0.993
ROC AUC (validation): 0.961
{'C': 0.6576918918918918, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.959
ROC AUC (train): 0.982
ROC AUC (validation): 0.991
{'C': 0.4755279279279279, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.940
Accuracy score (validation): 0.965
ROC AUC (train): 0.981
ROC AUC (validation): 0.994
{'C': 0.7537783783783784, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.947
ROC AUC (train): 0.9



{'C': 0.454509009009009, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.940
Accuracy score (validation): 0.924
ROC AUC (train): 0.986
ROC AUC (validation): 0.982
{'C': 0.2543288288288288, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.947
ROC AUC (train): 0.981
ROC AUC (validation): 0.990
{'C': 0.757781981981982, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.940
Accuracy score (validation): 0.959
ROC AUC (train): 0.987
ROC AUC (validation): 0.984
{'C': 0.6566909909909909, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.947
Accuracy score (validation): 0.953
ROC AUC (train): 0.984
ROC AUC (validation): 0.985
{'C': 0.20628558558558557, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.952
Accuracy score (validation): 0.936
ROC AUC (train): 0.988



{'C': 0.10119099099099099, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.942
ROC AUC (train): 0.986
ROC AUC (validation): 0.981
{'C': 0.39945945945945943, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.947
ROC AUC (train): 0.983
ROC AUC (validation): 0.987
{'C': 0.46651981981981977, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.955
Accuracy score (validation): 0.942
ROC AUC (train): 0.982
ROC AUC (validation): 0.989




{'C': 0.353418018018018, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.936
ROC AUC (train): 0.983
ROC AUC (validation): 0.991
{'C': 0.44449999999999995, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.940
Accuracy score (validation): 0.936
ROC AUC (train): 0.983
ROC AUC (validation): 0.988
{'C': 0.03713333333333334, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.932
Accuracy score (validation): 0.906
ROC AUC (train): 0.984
ROC AUC (validation): 0.980
{'C': 0.30537477477477476, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.947
ROC AUC (train): 0.984
ROC AUC (validation): 0.983
{'C': 0.5115603603603603, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.940
Accuracy score (validation): 0.965
ROC AUC (train): 0.



{'C': 0.41347207207207204, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.947
Accuracy score (validation): 0.936
ROC AUC (train): 0.985
ROC AUC (validation): 0.983
{'C': 0.22630360360360358, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.947
Accuracy score (validation): 0.953
ROC AUC (train): 0.984
ROC AUC (validation): 0.986
{'C': 0.2473225225225225, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.930
ROC AUC (train): 0.985
ROC AUC (validation): 0.984




{'C': 0.4144729729729729, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.947
Accuracy score (validation): 0.953
ROC AUC (train): 0.986
ROC AUC (validation): 0.982
{'C': 0.21129009009009006, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.918
ROC AUC (train): 0.987
ROC AUC (validation): 0.978
{'C': 0.1101990990990991, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.912
ROC AUC (train): 0.986
ROC AUC (validation): 0.983
{'C': 0.14022612612612612, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.950
Accuracy score (validation): 0.936
ROC AUC (train): 0.985
ROC AUC (validation): 0.981
{'C': 0.9799819819819819, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.937
Accuracy score (validation): 0.953
ROC AUC (train): 0.



{'C': 0.22930630630630627, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.959
ROC AUC (train): 0.983
ROC AUC (validation): 0.989




{'C': 0.505554954954955, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.965
ROC AUC (train): 0.984
ROC AUC (validation): 0.990




{'C': 0.3323990990990991, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.950
Accuracy score (validation): 0.947
ROC AUC (train): 0.984
ROC AUC (validation): 0.985
{'C': 0.8398558558558558, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.952
Accuracy score (validation): 0.930
ROC AUC (train): 0.988
ROC AUC (validation): 0.980
{'C': 0.20728648648648645, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.937
Accuracy score (validation): 0.971
ROC AUC (train): 0.979
ROC AUC (validation): 0.997




{'C': 0.3754378378378378, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.947
ROC AUC (train): 0.981
ROC AUC (validation): 0.993
{'C': 0.16124504504504503, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.937
Accuracy score (validation): 0.965
ROC AUC (train): 0.984
ROC AUC (validation): 0.987
{'C': 0.21028918918918918, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.940
Accuracy score (validation): 0.953
ROC AUC (train): 0.983
ROC AUC (validation): 0.986
{'C': 0.4795315315315315, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.936
ROC AUC (train): 0.983
ROC AUC (validation): 0.991
{'C': 0.22330090090090088, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.935
Accuracy score (validation): 0.971
ROC AUC (train): 0



{'C': 0.8668801801801801, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.950
Accuracy score (validation): 0.947
ROC AUC (train): 0.982
ROC AUC (validation): 0.991
{'C': 0.1432288288288288, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.912
ROC AUC (train): 0.985
ROC AUC (validation): 0.986
{'C': 0.9679711711711712, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.957
Accuracy score (validation): 0.930
ROC AUC (train): 0.988
ROC AUC (validation): 0.981
{'C': 0.29436486486486485, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.924
ROC AUC (train): 0.988
ROC AUC (validation): 0.972
{'C': 0.6837153153153153, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.953
ROC AUC (train): 0.9



{'C': 0.16825135135135133, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.947
ROC AUC (train): 0.982
ROC AUC (validation): 0.987
{'C': 0.8178360360360359, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.942
ROC AUC (train): 0.982
ROC AUC (validation): 0.993
{'C': 0.8038234234234234, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.955
Accuracy score (validation): 0.942
ROC AUC (train): 0.986
ROC AUC (validation): 0.983
{'C': 0.232309009009009, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.947
ROC AUC (train): 0.986
ROC AUC (validation): 0.982
{'C': 0.9699729729729729, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.937
Accuracy score (validation): 0.971
ROC AUC (train): 0.98



{'C': 0.33940540540540537, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.959
ROC AUC (train): 0.981
ROC AUC (validation): 0.989
{'C': 0.2513261261261261, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.924
ROC AUC (train): 0.986
ROC AUC (validation): 0.978
{'C': 0.8909018018018018, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.952
Accuracy score (validation): 0.942
ROC AUC (train): 0.984
ROC AUC (validation): 0.989




{'C': 0.2433189189189189, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.952
Accuracy score (validation): 0.936
ROC AUC (train): 0.987
ROC AUC (validation): 0.979
{'C': 0.43949549549549544, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.962
Accuracy score (validation): 0.918
ROC AUC (train): 0.986
ROC AUC (validation): 0.981
Average accuracy score (train): 0.946
Average accuracy score (validation): 0.945
Standard deviation of accuracy score (train): 0.007
Standard deviation of accuracy score (validation): 0.016
95% CI of accuracy score (train):  [0.93335427 0.96111809]
95% CI of accuracy score (validation):  [0.9122807  0.97076023]
Average ROC AUC (train): 0.985
Average ROC AUC (validation): 0.984
Standard deviation of ROC AUC (train): 0.003
Standard deviation of ROC AUC (validation): 0.006
95% CI of ROC AUC (train):  [0.98055338 0.99032635]
95% CI of ROC AUC (validation):  [0.96966998 0.99365143]


In [7]:
# Gather the C value chosen in each trial. Iterating over best_params itself (rather than a
# fixed range(100)) keeps this correct when the search ran with a different NUM_TRIALS.
Cs = [trial_params['C'] for trial_params in best_params]

C_mean = np.mean(Cs)

print('Mean value of C parameter is: '+str(C_mean))
print('STD of C parameter is: '+str(np.std(Cs)))

# Final model: linear SVM using the averaged C, refit on all samples of the kept features.
params = {'C': C_mean, 'kernel': 'linear', 'probability': True, 'random_state': 12345}

# `scaler` and `estimator` are rebound here at module level; these are the objects the
# BreastCancerDiagnosis scoring function reads when it is called.
scaler = StandardScaler()
scaler.fit(X)

X_scaled = scaler.transform(X)

estimator = SVC()

estimator.set_params(**params)

estimator.fit(X_scaled,y)

print("Accuracy score (train): {0:.3f}".format(estimator.score(X_scaled, y)))

roc_auc = roc_auc_score(y, estimator.predict_proba(X_scaled)[:,1])

print("ROC AUC (train): {0:.3f}".format(roc_auc))

Mean value of C parameter is: 0.4324291351351351
STD of C parameter is: 0.2516631843063981
Accuracy score (train): 0.949
ROC AUC (train): 0.985


In [8]:
# Open a client connection to the TabPy server listening on localhost, port 9004
TABPY_ENDPOINT = 'http://localhost:9004/'
connection = tabpy_client.Client(TABPY_ENDPOINT)

In [9]:
# Scoring function published to TabPy: classifies new data points with the fitted SVC
def BreastCancerDiagnosis(mean_concavity, worst_area):
    """Return the predicted class name for each (mean_concavity, worst_area) pair.

    The two arguments are stacked as the columns of one feature matrix, in the same order
    as the features kept by the selection step ('mean concavity', 'worst area'). The matrix
    is standardised with the module-level `scaler`, classified with the module-level
    `estimator`, and the predicted codes are decoded with the module-level `encoder`.

    Parameters
    ----------
    mean_concavity, worst_area : sequences of numbers
        One value per data point; both are expected to have the same length.

    Returns
    -------
    list
        Decoded class label for every data point, in input order.
    """
    feature_matrix = np.column_stack((mean_concavity, worst_area))
    scaled_matrix = scaler.transform(feature_matrix)
    predicted_codes = estimator.predict(scaled_matrix)
    decoded_labels = encoder.inverse_transform(predicted_codes)
    return decoded_labels.tolist()

In [10]:
# Publish the BreastCancerDiagnosis function to the TabPy server so it can be used from Tableau.
# It is registered under the endpoint name 'BreastCancerDiagnosis' together with a short
# description of what it does; override=True is passed so that a re-run of this cell can
# redeploy over an endpoint of the same name.
connection.deploy('BreastCancerDiagnosis',
                  BreastCancerDiagnosis,
                  'Returns diagnosis suggestion based on a linear SVM model trained using Wisconsin Breast Cancer dataset',override=True)