In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pylab as pl
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from __future__ import print_function
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score

In [2]:
dfA = pd.read_csv("wbc.csv")
dfA=dfA.replace('?',np.nan)
dfA['bare_nuclei'] = pd.to_numeric(dfA.bare_nuclei.fillna(dfA.bare_nuclei.median()))

In [3]:
X = dfA.drop([dfA.columns[-1]], axis=1)
y = dfA.jenis.map({2: 0, 4: 1})

In [4]:
RAND_SEED_SPLIT = RAND_SEED_TUNING = 84

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33, random_state=RAND_SEED_SPLIT)

In [1]:
svc=SVC()



In [7]:
C_range= [2**i for i in [-5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15]]
gamma_range= [2**i for i in [-15, -13, -11, -9, -7, -5, -3, -1, 1, 3]]

In [6]:
def cost_breast_cancer(y_true, y_pred):
    CM = confusion_matrix(y_true, y_pred)
    FN = CM[1][0]
    FP = CM[0][1]
    costBC=((-1*((228.35*FP)+(2850000*FN))))
    return costBC

In [7]:
def specificity(y_true, y_pred):
    CM = confusion_matrix(y_true, y_pred)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    nilai=(TN/(FP+TN))
    return nilai

In [8]:
def output_score(y_true, y_pred):
    CM = confusion_matrix(y_true, y_pred)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    specificity=(TN/(FP+TN))
    sensitivity=(TP/(TP+FN))
    accuracy=((TP+TN)/(TP+FP+TN+FN))
    costBC=((-1*((228.35*FP)+(2850000*FN))))
    return [specificity, sensitivity, accuracy, costBC]

In [9]:
my_custom_scorer=make_scorer(cost_breast_cancer, greater_is_better=True)
score_specificity=make_scorer(specificity, greater_is_better=True)
output_scoring=make_scorer(output_score, greater_is_better=True)

In [11]:
print(__doc__)

tuned_parameters = [{'kernel': ['rbf'], 'gamma': gamma_range,
                     'C': C_range},]

scores = [my_custom_scorer]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf2 = GridSearchCV(svc, tuned_parameters, cv=10,
                       scoring=my_custom_scorer)
    clf2.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf2.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf2.cv_results_['mean_test_score']
    stds = clf2.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf2.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()


Automatically created module for IPython interactive environment
# Tuning hyper-parameters for make_scorer(cost_breast_cancer)

Best parameters set found on development set:

{'gamma': 0.125, 'kernel': 'rbf', 'C': 0.5}

Grid scores on development set:

-47329487.179 (+/-2784178.556) for {'gamma': 3.0517578125e-05, 'kernel': 'rbf', 'C': 0.03125}
-47329487.179 (+/-2784178.556) for {'gamma': 0.0001220703125, 'kernel': 'rbf', 'C': 0.03125}
-44771794.872 (+/-4444027.676) for {'gamma': 0.00048828125, 'kernel': 'rbf', 'C': 0.03125}
-5164239.672 (+/-6631463.620) for {'gamma': 0.001953125, 'kernel': 'rbf', 'C': 0.03125}
-1717536.042 (+/-2789393.112) for {'gamma': 0.0078125, 'kernel': 'rbf', 'C': 0.03125}
-852976.889 (+/-2610192.197) for {'gamma': 0.03125, 'kernel': 'rbf', 'C': 0.03125}
-641.137 (+/-860.823) for {'gamma': 0.125, 'kernel': 'rbf', 'C': 0.03125}
-47329487.179 (+/-2784178.556) for {'gamma': 0.5, 'kernel': 'rbf', 'C': 0.03125}
-47329487.179 (+/-2784178.556) for {'gamma': 2, 'kernel':

In [12]:
print("Best score cost obtained: {0}".format(clf2.best_score_))
print("Parameters:")
for key, value in clf2.best_params_.items():
    print("\t{}: {}".format(key, value))

Best score cost obtained: -595.2713675213676
Parameters:
	gamma: 0.125
	kernel: rbf
	C: 0.5


In [13]:
clf2.best_estimator_

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.125, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
bestSVC=SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.125, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
bestSVC.fit(X_train, y_train)

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.125, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
scores1=cross_val_score(bestSVC,X_train,y_train,cv=10,scoring= output_scoring)

ValueError: scoring must return a number, got [0.90322580645161288, 1.0, 0.9375, -685.04999999999995] (<class 'list'>) instead.

In [17]:
print("Score cost: %0.4f (+/- %0.4f)" % (scores1.mean(), scores1.std() * 2))

Score cost: 0.9139 (+/- 0.1163)


In [18]:
scores2=cross_val_score(bestSVC,X_train,y_train,cv=10,scoring=my_custom_scorer)

In [19]:
print("Score cost: %0.4f (+/- %0.4f)" % (scores2.mean(), scores2.std() * 2))

Score cost: -593.7100 (+/- 796.2837)


In [20]:
scores3=cross_val_score(bestSVC,X_train,y_train,cv=10,scoring='recall')

In [21]:
print("Score cost: %0.4f (+/- %0.4f)" % (scores3.mean(), scores3.std() * 2))

Score cost: 1.0000 (+/- 0.0000)


In [22]:
scores4=cross_val_score(bestSVC,X_train,y_train,cv=10,scoring='accuracy')

In [23]:
print("Score cost: %0.4f (+/- %0.4f)" % (scores4.mean(), scores4.std() * 2))

Score cost: 0.9446 (+/- 0.0743)


In [24]:
from xgboost import XGBClassifier



In [25]:
from sklearn.model_selection import RandomizedSearchCV

In [26]:
from sklearn.cross_validation import StratifiedKFold

In [27]:
cv = StratifiedKFold(y_train, n_folds=10, shuffle=True, random_state=RAND_SEED_SPLIT)

In [28]:
classifier = XGBClassifier()

In [31]:
print(__doc__)

# Set the parameters by cross-validation
tuned_parameters ={'n_estimators': [10,20,30,50],
                    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.],
                    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.],
                    'reg_alpha': [0., 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1., 5., 10.],
                    'reg_lambda': [0., 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1., 5., 10.],
                    'scale_pos_weight': np.linspace(1., (y_train == 2).sum() / y_train.sum(), 10)
                   }    

                  
params_fixed = {
    'objective': 'binary:logistic',
    'silent': 1
}
scores = [my_custom_scorer]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = RandomizedSearchCV(estimator=XGBClassifier(**params_fixed, seed=RAND_SEED_TUNING),
    param_distributions=tuned_parameters, cv=cv, scoring=my_custom_scorer)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

Automatically created module for IPython interactive environment
# Tuning hyper-parameters for make_scorer(cost_breast_cancer)

Best parameters set found on development set:

{'n_estimators': 10, 'reg_lambda': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.5, 'scale_pos_weight': 1.0, 'reg_alpha': 0.001}

Grid scores on development set:

-1711445.323 (+/-3784255.368) for {'n_estimators': 10, 'reg_lambda': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.5, 'scale_pos_weight': 1.0, 'reg_alpha': 0.001}
-1991528.639 (+/-4444425.729) for {'n_estimators': 20, 'reg_lambda': 1.0, 'subsample': 0.5, 'colsample_bytree': 0.5, 'scale_pos_weight': 0.66666666666666674, 'reg_alpha': 5.0}
-7697549.585 (+/-8096019.022) for {'n_estimators': 30, 'reg_lambda': 0.0, 'subsample': 0.8, 'colsample_bytree': 0.6, 'scale_pos_weight': 0.11111111111111116, 'reg_alpha': 0.005}
-2277746.587 (+/-3426531.765) for {'n_estimators': 50, 'reg_lambda': 0.001, 'subsample': 0.7, 'colsample_bytree': 0.8, 'scale_pos_weight': 1.0, '

In [32]:
print("Best score cost obtained: {0}".format(clf.best_score_))
print("Parameters:")
for key, value in clf.best_params_.items():
    print("\t{}: {}".format(key, value))

Best score cost obtained: -1711445.3228632482
Parameters:
	n_estimators: 10
	reg_lambda: 0.01
	subsample: 0.7
	colsample_bytree: 0.5
	scale_pos_weight: 1.0
	reg_alpha: 0.001


In [33]:
clf.best_estimator_

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10, nthread=-1,
       objective='binary:logistic', reg_alpha=0.001, reg_lambda=0.01,
       scale_pos_weight=1.0, seed=84, silent=1, subsample=0.7)

In [34]:
bestXGB=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10, nthread=-1,
       objective='binary:logistic', reg_alpha=0.001, reg_lambda=0.01,
       scale_pos_weight=1.0, seed=84, silent=1, subsample=0.7)

In [35]:
bestXGB.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10, nthread=-1,
       objective='binary:logistic', reg_alpha=0.001, reg_lambda=0.01,
       scale_pos_weight=1.0, seed=84, silent=1, subsample=0.7)

In [36]:
scores1=cross_val_score(bestXGB,X_train,y_train,cv=cv,scoring=score_specificity)
print("Specificity: %0.4f (+/- %0.4f)" % (scores1.mean(), scores1.std() * 2))
scores2=cross_val_score(bestXGB,X_train,y_train,cv=cv,scoring='accuracy')
print("Accuracy: %0.4f (+/- %0.4f)" % (scores2.mean(), scores2.std() * 2))
scores3=cross_val_score(bestXGB,X_train,y_train,cv=cv,scoring='recall')
print("Recall: %0.4f (+/- %0.4f)" % (scores3.mean(), scores3.std() * 2))
scores4=cross_val_score(bestXGB,X_train,y_train,cv=cv,scoring=my_custom_scorer)
print("Score cost: %0.4f (+/- %0.4f)" % (scores4.mean(), scores4.std() * 2))

Specificity: 0.9668 (+/- 0.0422)
Accuracy: 0.9657 (+/- 0.0393)
Recall: 0.9640 (+/- 0.0790)
Score cost: -1710228.3500 (+/- 3780952.2720)


In [10]:
seed = 7
np.random.seed(seed)

In [11]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from keras import optimizers

Using Theano backend.


In [12]:
def create_model8():
    model8 = Sequential()
    model8.add(Dense(10, input_dim=9, activation='relu'))
    model8.add(Dense(10, activation='relu'))
    model8.add(Dense(10, activation='relu'))
    model8.add(Dense(10, activation='relu'))
    model8.add(Dense(10, activation='relu'))
    model8.add(Dense(10, activation='relu'))
    model8.add(Dense(10, activation='relu'))
    model8.add(Dense(10, activation='relu'))
    model8.add(Dense(1, activation='sigmoid'))
    adm = optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model8.compile(loss='binary_crossentropy', optimizer=adm, metrics=['accuracy'])
    return model8

In [13]:
from keras.models import load_model

In [14]:
model=create_model8()

In [None]:
model.save_weights('my_model_weights.h5')

In [15]:
from sklearn.metrics import confusion_matrix

In [17]:
import warnings
warnings.filterwarnings('ignore', message='.*is ill-defined', append=True)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [18]:
epochs = [10, 20, 50, 100, 200, 500]
class_weight = [{0 : 1., 1: 2},
                {0 : 1., 1: 4},
                {0 : 1., 1: 8},
                {0 : 1., 1: 16},
                {0 : 1., 1: 32},
                {0 : 1., 1: 64}]
results=[]
model.load_weights('my_model_weights.h5')

for epochs_idx in range(len(epochs)):
  for class_weight_idx in range(len(class_weight)):
    param1 = epochs[epochs_idx]
    param2 = class_weight[class_weight_idx]
    bestmodelNN = KerasClassifier(build_fn=create_model8, epochs= param1, class_weight = param2, verbose=0)
    model.load_weights('my_model_weights.h5')
    print("Epochs : %r & Class Weight : %r)" % (param1, param2))
    scores1=cross_val_score(bestmodelNN,X_train.as_matrix(),y_train,cv=10,scoring=score_specificity)
    print("Specificity: %0.4f (+/- %0.4f)" % (scores1.mean(), scores1.std() ))
    model.load_weights('my_model_weights.h5')
    scores2=cross_val_score(bestmodelNN,X_train.as_matrix(),y_train,cv=10,scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores2.mean(), scores2.std() ))
    model.load_weights('my_model_weights.h5')
    scores3=cross_val_score(bestmodelNN,X_train.as_matrix(),y_train,cv=10,scoring='recall')
    print("Recall: %0.4f (+/- %0.4f)" % (scores3.mean(), scores3.std() ))
    model.load_weights('my_model_weights.h5')
    scores4=cross_val_score(bestmodelNN,X_train.as_matrix(), y_train, cv=10, scoring=my_custom_scorer)
    print("Score cost: %0.4f (+/- %0.4f)" % (scores4.mean(), scores4.std() ))
    print()
    print()


Epochs : 10 & Class Weight : {0: 1.0, 1: 2})
Specificity: 0.9599 (+/- 0.0543)
Accuracy: 0.9595 (+/- 0.0335)
Recall: 0.9683 (+/- 0.0696)
Score cost: -1995274.0200 (+/- 1824840.5135)


Epochs : 10 & Class Weight : {0: 1.0, 1: 4})
Specificity: 0.9388 (+/- 0.0493)
Accuracy: 0.9616 (+/- 0.0354)
Recall: 0.9938 (+/- 0.0187)
Score cost: -570456.7000 (+/- 1139885.8788)


Epochs : 10 & Class Weight : {0: 1.0, 1: 8})
Specificity: 0.8860 (+/- 0.0872)
Accuracy: 0.9638 (+/- 0.0301)
Recall: 0.9895 (+/- 0.0316)
Score cost: -1140525.2050 (+/- 2279851.6433)


Epochs : 10 & Class Weight : {0: 1.0, 1: 16})
Specificity: 0.9147 (+/- 0.0528)
Accuracy: 0.9210 (+/- 0.0496)
Recall: 0.9843 (+/- 0.0346)
Score cost: -286187.4200 (+/- 854758.2331)


Epochs : 10 & Class Weight : {0: 1.0, 1: 32})
Specificity: 0.5977 (+/- 0.3704)
Accuracy: 0.9234 (+/- 0.0653)
Recall: 1.0000 (+/- 0.0000)
Score cost: -1507.1100 (+/- 2432.1151)


Epochs : 10 & Class Weight : {0: 1.0, 1: 64})
Specificity: 0.8072 (+/- 0.1226)
Accuracy: 0.7

In [19]:
best_class_weight = {0: 1.0, 1: 8}

In [20]:
bestmodelNN_test = KerasClassifier(build_fn=create_model8, epochs= 20, class_weight = best_class_weight, verbose=0)

In [21]:
import warnings
warnings.filterwarnings('ignore', message='.*is ill-defined', append=True)
warnings.filterwarnings('ignore', category=DeprecationWarning)
for i in range(10):
    model.load_weights('my_model_weights.h5')
    scores1=cross_val_score(bestmodelNN_test, X_train.as_matrix(),y_train,cv=10,scoring=score_specificity)
    print("Specificity: %0.4f (+/- %0.4f)" % (scores1.mean(), scores1.std()*2))
    model.load_weights('my_model_weights.h5')
    scores3=cross_val_score(bestmodelNN_test, X_train.as_matrix(),y_train,cv=10,scoring='recall')
    print("Recall: %0.4f (+/- %0.4f)" % (scores3.mean(), scores3.std()*2))
    print()
    print()

Specificity: 0.9419 (+/- 0.0895)
Recall: 1.0000 (+/- 0.0000)


Specificity: 0.9265 (+/- 0.1390)
Recall: 0.9952 (+/- 0.0286)


Specificity: 0.9222 (+/- 0.1428)
Recall: 1.0000 (+/- 0.0000)


Specificity: 0.9345 (+/- 0.1291)
Recall: 0.9952 (+/- 0.0286)


Specificity: 0.9302 (+/- 0.1065)
Recall: 0.9955 (+/- 0.0273)


Specificity: 0.9420 (+/- 0.0989)
Recall: 0.9905 (+/- 0.0571)


Specificity: 0.9210 (+/- 0.1050)
Recall: 0.9947 (+/- 0.0316)


Specificity: 0.9521 (+/- 0.1015)
Recall: 0.9900 (+/- 0.0402)


Specificity: 0.9172 (+/- 0.1240)
Recall: 0.9847 (+/- 0.0468)


Specificity: 0.9357 (+/- 0.1184)
Recall: 1.0000 (+/- 0.0000)


