In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import tabpy_client
from sklearn.datasets import load_breast_cancer

In [2]:
# Load the bundled breast-cancer dataset and expose its pieces under short names.
data = load_breast_cancer()

X = data.data

y = data.target

columns = data.feature_names

# Encoder used to turn integer targets/predictions back into class-name strings.
encoder = preprocessing.LabelEncoder()

encoder.fit(data.target_names)

# LabelEncoder stores its classes in sorted order, which does not match the
# dataset's own integer coding (index i of data.target_names is the name of
# target value i). Restore the dataset order so inverse_transform(y) yields
# data.target_names[y] instead of swapped labels.
encoder.classes_ = np.asarray(data.target_names)

encoder

LabelEncoder()

In [3]:
# Build the export frame from the dataset itself rather than the globals X / y:
# a later cell rebinds X to the reduced feature matrix, which would break this
# cell on a re-run. Adding the label as its own column (instead of concatenating
# it into the numeric array) keeps the feature columns numeric rather than
# coercing every value to a string.
df = pd.DataFrame(data=data.data, columns=data.feature_names.tolist())
df['Class'] = encoder.inverse_transform(data.target)
df.to_csv('./Data.csv',index=False)

In [4]:
# Fixed hyper-parameters of the linear SVM that drives recursive feature elimination.
params = {'C': 1.0, 'kernel': 'linear', 'probability': True, 'random_state': 12345}

# Always start from the dataset's full feature matrix. This cell rebinds X to the
# reduced matrix at the end, so reading X here would make a second run of the
# cell operate on already-reduced data.
X_all = data.data

scaler = StandardScaler()
scaler.fit(X_all)

X_all_scaled = scaler.transform(X_all)

estimator = SVC()

estimator.set_params(**params)

# Keep the two features ranked best by recursive feature elimination.
selector = RFE(estimator=estimator,n_features_to_select=2)

selector.fit(X_all_scaled,y)

# Reduced matrices used by the following cells: X_scaled (standardized) and X (raw).
X_scaled = selector.transform(X_all_scaled)
X = selector.transform(X_all)

# The estimator fitted by RFE on the selected features only.
estimator = selector.estimator_

print("Accuracy score (train): {0:.3f}".format(estimator.score(X_scaled, y)))

roc_auc = roc_auc_score(y, estimator.predict_proba(X_scaled)[:,1])

print("ROC AUC (train): {0:.3f}".format(roc_auc))

features_kept = columns[selector.support_]

print(features_kept)

Accuracy score (train): 0.949
ROC AUC (train): 0.985
['mean concavity' 'worst area']


In [5]:
def _print_score_summary(metric, train_scores, val_scores):
    """Print mean, standard deviation and 2.5/97.5 percentiles of per-trial scores."""
    splits = (("train", train_scores), ("validation", val_scores))

    for split, scores in splits:
        print("Average {0} ({1}): {2:.3f}".format(metric, split, np.mean(scores)))

    for split, scores in splits:
        print("Standard deviation of {0} ({1}): {2:.3f}".format(metric, split, np.std(scores)))

    for split, scores in splits:
        # Labelled "95% CI": these are the empirical 2.5th / 97.5th percentiles over trials.
        print("95% CI of {0} ({1}): ".format(metric, split), np.percentile(scores, (2.5, 97.5)))


def ModelSelectionNestedCV(estimator,params,X,y,NUM_TRIALS=100,random_state=None):
    """Repeat a hold-out split with an inner grid search and summarize the scores.

    Each trial draws a stratified 70/30 train/validation split, standardizes both
    parts with a scaler fitted on the training part, runs a 5-fold stratified
    GridSearchCV (scored by accuracy, refit on the training part) and records the
    accuracy and ROC AUC of the refitted best estimator on both parts. Per-trial
    results and a final summary are printed.

    Parameters
    ----------
    estimator : estimator object
        Template passed to GridSearchCV on every trial; it must expose
        ``predict_proba`` once fitted. It is never rebound or fitted directly here.
    params : dict
        Parameter grid forwarded as ``param_grid``.
    X, y : array-like
        Feature matrix and targets.
    NUM_TRIALS : int, default 100
        Number of independent trials.
    random_state : int or None, default None
        When an int is given, trial ``i`` seeds both the hold-out split and the
        inner fold shuffling with ``random_state + i`` so a run can be reproduced.
        ``None`` keeps the unseeded behaviour.

    Returns
    -------
    tuple of lists
        ``(best_estimators, best_params, cum_train_aucc, cum_val_aucc,
        cum_train_auc, cum_val_auc)``, each with one entry per trial.
    """
    cum_train_aucc = []
    cum_val_aucc = []
    cum_train_auc = []
    cum_val_auc = []

    best_estimators = []
    best_params = []

    for iteration in range(NUM_TRIALS):

        trial_seed = None if random_state is None else random_state + iteration

        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.3, stratify=y, shuffle=True, random_state=trial_seed)

        scaler = StandardScaler()
        scaler.fit(X_train)

        X_train = scaler.transform(X_train)
        X_val = scaler.transform(X_val)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=trial_seed)

        clf = GridSearchCV(estimator=estimator,param_grid=params,scoring='accuracy',n_jobs=-1,cv=skf,refit=True,verbose=0)

        clf.fit(X_train,y_train)

        # Use a separate name: the `estimator` argument stays the untouched template
        # for the next trial instead of being replaced by this trial's fitted model.
        best_model = clf.best_estimator_

        best_estimators.append(best_model)

        print(clf.best_params_)

        best_params.append(clf.best_params_)

        # Score once and reuse the values for both printing and bookkeeping.
        train_acc = best_model.score(X_train, y_train)
        val_acc = best_model.score(X_val, y_val)

        print("Accuracy score (train): {0:.3f}".format(train_acc))
        print("Accuracy score (validation): {0:.3f}".format(val_acc))

        train_auc = roc_auc_score(y_train, best_model.predict_proba(X_train)[:,1])
        val_auc = roc_auc_score(y_val, best_model.predict_proba(X_val)[:,1])

        print("ROC AUC (train): {0:.3f}".format(train_auc))
        print("ROC AUC (validation): {0:.3f}".format(val_auc))

        cum_train_aucc.append(train_acc)
        cum_val_aucc.append(val_acc)

        cum_train_auc.append(train_auc)
        cum_val_auc.append(val_auc)

    _print_score_summary("accuracy score", cum_train_aucc, cum_val_aucc)
    _print_score_summary("ROC AUC", cum_train_auc, cum_val_auc)

    return best_estimators, best_params, cum_train_aucc, cum_val_aucc, cum_train_auc, cum_val_auc

In [6]:
# Number of repeated hold-out trials for the model-selection run below.
NUM_TRIALS=10

# Grid explored on every trial: 1000 evenly spaced C values for a linear SVM;
# the remaining entries are single-valued and therefore fixed.
params = {
    'C': np.linspace(1e-4,1,1000).tolist(),
    'kernel': ['linear'],
    'probability': [True],
    'random_state': [12345],
}

estimator = SVC()

(best_estimators,
 best_params,
 cum_train_aucc,
 cum_val_aucc,
 cum_train_auc,
 cum_val_auc) = ModelSelectionNestedCV(estimator,params,X,y,NUM_TRIALS)

{'C': 0.39645675675675673, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.959
ROC AUC (train): 0.982
ROC AUC (validation): 0.993
{'C': 0.5716144144144144, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.950
Accuracy score (validation): 0.953
ROC AUC (train): 0.984
ROC AUC (validation): 0.987
{'C': 0.3013711711711711, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.942
ROC AUC (train): 0.982
ROC AUC (validation): 0.988
{'C': 0.43248918918918916, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.959
ROC AUC (train): 0.986
ROC AUC (validation): 0.979
{'C': 0.23330990990990988, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.955
Accuracy score (validation): 0.936
ROC AUC (train): 0



{'C': 0.40746666666666664, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.945
Accuracy score (validation): 0.953
ROC AUC (train): 0.986
ROC AUC (validation): 0.984
{'C': 0.6246621621621621, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.940
Accuracy score (validation): 0.959
ROC AUC (train): 0.984
ROC AUC (validation): 0.987




{'C': 0.22229999999999997, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.957
Accuracy score (validation): 0.918
ROC AUC (train): 0.985
ROC AUC (validation): 0.979
{'C': 0.39945945945945943, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.942
Accuracy score (validation): 0.947
ROC AUC (train): 0.986
ROC AUC (validation): 0.982
{'C': 0.6837153153153153, 'kernel': 'linear', 'probability': True, 'random_state': 12345}
Accuracy score (train): 0.937
Accuracy score (validation): 0.953
ROC AUC (train): 0.980
ROC AUC (validation): 0.996
Average accuracy score (train): 0.945
Average accuracy score (validation): 0.948
Standard deviation of accuracy score (train): 0.006
Standard deviation of accuracy score (validation): 0.012
95% CI of accuracy score (train):  [0.93775126 0.95672111]
95% CI of accuracy score (validation):  [0.92207602 0.95906433]
Average ROC AUC (train): 0.984
Average ROC AUC (validation): 0.986


In [7]:
# Collect the C value chosen in every trial. Iterating over best_params itself
# (rather than range(NUM_TRIALS)) keeps the cell correct even if NUM_TRIALS no
# longer matches the number of recorded trials.
Cs = [trial_params['C'] for trial_params in best_params]

# Computed once and reused for both the report and the final model.
C_mean = np.mean(Cs)

print('Mean value of C parameter is: '+str(C_mean))
print('STD of C parameter is: '+str(np.std(Cs)))

params = {'C': C_mean, 'kernel': 'linear', 'probability': True, 'random_state': 12345}

# Refit the scaler and the final model on all rows of X; these module-level
# `scaler` and `estimator` objects are the ones the scoring function defined
# further down reads.
scaler = StandardScaler()
scaler.fit(X)

X_scaled = scaler.transform(X)

estimator = SVC()

estimator.set_params(**params)

estimator.fit(X_scaled,y)

print("Accuracy score (train): {0:.3f}".format(estimator.score(X_scaled, y)))

roc_auc = roc_auc_score(y, estimator.predict_proba(X_scaled)[:,1])

print("ROC AUC (train): {0:.3f}".format(roc_auc))

Mean value of C parameter is: 0.42728450450450445
STD of C parameter is: 0.14939518061788776
Accuracy score (train): 0.947
ROC AUC (train): 0.985


In [8]:
# Address of the TabPy server that the functions below are published to.
TABPY_URL = 'http://localhost:9004/'

# Open a client connection to that server.
connection = tabpy_client.Client(TABPY_URL)

In [9]:
# Scoring function published to TabPy: classifies new data points with the
# module-level `scaler`, `estimator` and `encoder` objects.
def BreastCancerDiagnosis(mean_concavity, worst_area):
    """Return the predicted class name for each (mean_concavity, worst_area) pair.

    The two inputs are stacked as the columns of a feature matrix, standardized
    with the module-level ``scaler``, classified by the module-level
    ``estimator`` and mapped back to class-name strings by the module-level
    ``encoder``.

    Parameters
    ----------
    mean_concavity, worst_area : sequence of numbers
        One value per observation; presumably equal-length lists sent by
        Tableau - TODO confirm against the caller.

    Returns
    -------
    list
        One label per observation.
    """
    features = np.column_stack((mean_concavity, worst_area))
    standardized = scaler.transform(features)
    predicted = estimator.predict(standardized)
    labels = encoder.inverse_transform(predicted)
    return labels.tolist()

In [10]:
# Publish the BreastCancerDiagnosis function to the TabPy server so it can be used from Tableau,
# under the endpoint name 'BreastCancerDiagnosis' with a short description of what it does.
# The description names the model actually deployed (a linear-kernel SVC), not an ensemble.
connection.deploy('BreastCancerDiagnosis',
                  BreastCancerDiagnosis,
                  'Returns diagnosis suggestion based on linear SVM model trained using Wisconsin Breast Cancer dataset',override=True)

In [11]:
from sklearn.cluster import KMeans

In [12]:
def kmeans(var1,var2,kcluster):
    """Cluster observations described by two variables and return their labels.

    The two inputs become the columns of a feature matrix, which is standardized
    and clustered with KMeans (``random_state=0``).

    Parameters
    ----------
    var1, var2 : sequence of numbers
        One value per observation.
    kcluster : indexable
        Its first element, converted with ``int``, is the number of clusters.

    Returns
    -------
    list of int
        Cluster label of each observation.
    """
    features = StandardScaler().fit_transform(np.column_stack([var1,var2]))
    n_clusters = int(kcluster[0])

    # Named `model` so the local does not shadow this function's own name.
    model = KMeans(n_clusters=n_clusters, random_state=0)
    model.fit(features)
    return model.labels_.tolist()

In [13]:
# Publish the clustering helper under the endpoint name 'Kmeans-clust',
# replacing any endpoint already registered under that name.
connection.deploy(
    'Kmeans-clust',
    kmeans,
    'Returns the clustering label for each individual',
    override=True,
)

In [14]:
def pca1(_arg1,_arg2,_arg3):
    """Return the first principal-component coordinate of three standardized variables.

    The three inputs become the columns of a feature matrix, which is
    standardized and projected onto a 2-component PCA fitted on that same matrix.

    Parameters
    ----------
    _arg1, _arg2, _arg3 : sequence of numbers
        One value per observation.

    Returns
    -------
    list of float
        Coordinate of each observation along the first component.
    """
    # PCA is not among the imports visible in this notebook, so the original body
    # raised NameError when called; import it where it is used.
    from sklearn.decomposition import PCA

    features = np.column_stack([_arg1,_arg2,_arg3])
    features = StandardScaler().fit_transform(features)

    pca = PCA(n_components=2)
    pca.fit(features)
    projected = pca.transform(features)

    return projected[:,0].tolist()

In [15]:
def pca2(_arg1,_arg2,_arg3):
    """Return the second principal-component coordinate of three standardized variables.

    The three inputs become the columns of a feature matrix, which is
    standardized and projected onto a 2-component PCA fitted on that same matrix.

    Parameters
    ----------
    _arg1, _arg2, _arg3 : sequence of numbers
        One value per observation.

    Returns
    -------
    list of float
        Coordinate of each observation along the second component.
    """
    # PCA is not among the imports visible in this notebook, so the original body
    # raised NameError when called; import it where it is used.
    from sklearn.decomposition import PCA

    features = np.column_stack([_arg1,_arg2,_arg3])
    features = StandardScaler().fit_transform(features)

    pca = PCA(n_components=2)
    pca.fit(features)
    projected = pca.transform(features)

    return projected[:,1].tolist()

In [16]:
# Publish both PCA coordinate helpers, replacing any endpoints already
# registered under the same names.
for endpoint_name, endpoint_func, endpoint_description in (
    ('pca1', pca1, 'Returns PCA coordinate 1'),
    ('pca2', pca2, 'Returns PCA coordinate 2'),
):
    connection.deploy(endpoint_name, endpoint_func, endpoint_description, override=True)