In [3]:
import random
import time
import scipy.io
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np
import scipy.io
from scipy import interp
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from interpret import show
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [4]:
def balance_dt(data, label, seed=None):
    random.seed(seed)
    ones = []
    for i in range(len(label)):
        if label[i] == 1:
            ones.append(i)
    zeros = []
    for i in range(len(label)):
        if label[i] == 0:
            zeros.append(i)
    zeros = random.sample(zeros, len(ones))
    indices = zeros + ones
    X = data[indices]
    y = label[indices]
    return X, y

In [5]:
def balance_reversed(data, label, seed=None):
    random.seed(seed)
    zeros = []
    for i in range(len(label)):
        if label[i] == 0:
            zeros.append(i)
    ones = []
    for i in range(len(label)):
        if label[i] == 1:
            ones.append(i)
    ones = random.sample(ones, len(zeros))
    indices = zeros + ones
    X = data[indices]
    y = label[indices]
    return X, y


In [6]:
def eval_with_kfold(best_clf, x, y, org_dt, org_lb):
    cros_res = cross_val_score(best_clf, x, y, cv=10, scoring='accuracy')
    print("balanced accuracy", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=10, scoring='accuracy')
    print("unbalanced accuracy", np.mean(cros_res))

    cros_res = cross_val_score(best_clf, x, y, cv=10, scoring='precision')
    print("balanced precision", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=10, scoring='precision')
    print("unbalanced precision", np.mean(cros_res))

    cros_res = cross_val_score(best_clf, x, y, cv=10, scoring='recall')
    print("balanced recall", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=10, scoring='recall')
    print("unbalanced recall", np.mean(cros_res))

    cros_res = cross_val_score(best_clf, x, y, cv=10, scoring='f1')
    print("balanced f1", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=10, scoring='f1')
    print("unbalanced f1", np.mean(cros_res))

    cros_res = cross_val_score(best_clf, x, y, cv=10, scoring='roc_auc')
    print("balanced auc", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=10, scoring='roc_auc')
    print("unbalanced auc", np.mean(cros_res))

In [7]:
mat = scipy.io.loadmat('data.mat')
org_dat = mat['OriginalData']
stand_dat = mat['Scaled_Standardization']
minmax_dat = mat['Scaled_Min_Max']
label = mat['label'][0]

SVM With Normalized Data

In [None]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [None]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'kernel': ['linear', 'poly', 'rbf'], 'random_state': [i], 'C': [1, 2, 3, 4, 5]}

    clf = GridSearchCV(SVC(probability=True), parameters, n_jobs=-1, cv=10, verbose=1, scoring='f1')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

In [None]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [None]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

Interpretation of SVM Results Using LIME

Balanced

In [None]:
from interpret.blackbox import LimeTabular

seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
svc = SVC(gamma='auto', probability=True)

blackbox_model = Pipeline([('pca', pca), ('svc', svc)])
blackbox_model.fit(X_train, y_train)

lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train)
lime_local = lime.explain_local(X_test[:5], y_test[:5])

show(lime_local)

Unbalanced

In [None]:
from interpret.blackbox import LimeTabular

seed = 1
X = minmax_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
svc = SVC(gamma='auto', probability=True)

blackbox_model = Pipeline([('pca', pca), ('svc', svc)])
blackbox_model.fit(X_train, y_train)

lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train)
lime_local = lime.explain_local(X_test[:5], y_test[:5])

show(lime_local)

SVM With Not Normalized Data

In [None]:
# best_sc = 0
# best_x = []
# best_y = []
# best_es = None

In [None]:
# for i in range(5):
#     random.seed(i)
#     X, y = balance_dt(org_dat, label, seed=i)

#     parameters = {'kernel': ['linear', 'poly', 'rbf'], 'random_state': [i], 'C': [1, 2, 3, 4, 5]}

#     clf = GridSearchCV(SVC(probability=True), parameters, n_jobs=-1, cv=10, verbose=1, scoring='f1')
#     clf.fit(X, y)

#     if clf.best_score_ > best_sc:
#         best_sc = clf.best_score_
#         best_es = clf.best_estimator_
#         best_x = X
#         best_y = y

In [None]:
# print("-----------------Results--------------------")
# print("Best score: ", best_sc)
# print(best_es)

In [None]:
# eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

Interpretation of SVM Results Using LIME

Balanced

In [None]:
from interpret.blackbox import LimeTabular

seed = 1
X, y = balance_dt(org_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
svc = SVC(gamma='auto', probability=True)

blackbox_model = Pipeline([('pca', pca), ('svc', svc)])
blackbox_model.fit(X_train, y_train)

lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train)
lime_local = lime.explain_local(X_test[:5], y_test[:5])

show(lime_local)

Unbalanced

In [None]:
from interpret.blackbox import LimeTabular

seed = 1
X = org_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
svc = SVC(gamma='auto', probability=True)

blackbox_model = Pipeline([('pca', pca), ('svc', svc)])
blackbox_model.fit(X_train, y_train)

lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train)
lime_local = lime.explain_local(X_test[:5], y_test[:5])

show(lime_local)

CNN With Normalized Data

In [8]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [None]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'activation': ['relu'], 'solver': ['sgd'],
                  'learning_rate': ['constant'],
                  'hidden_layer_sizes': (90,),
                  'max_iter': [200, 500, 1000], 'random_state': [i]}

    clf = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1, cv=10, verbose=1, scoring='f1')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

In [None]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [None]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

Interpretation of CNN Results Using Kernel SHAP

Balanced

In [None]:
from interpret.blackbox import ShapKernel
seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
cnn = MLPClassifier(random_state = seed, max_iter=200)

blackbox_model = Pipeline([('pca', pca), ('cnn', cnn)])
blackbox_model.fit(X_train, y_train)

shap = ShapKernel(predict_fn=blackbox_model.predict_proba, data=X_train)
shap_local = shap.explain_local(X_test[:5], y_test[:5])

show(shap_local)

Unbalanced

In [None]:
from interpret.blackbox import ShapKernel
seed = 1
X = minmax_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
cnn = MLPClassifier(random_state = seed, max_iter=200)

blackbox_model = Pipeline([('pca', pca), ('cnn', cnn)])
blackbox_model.fit(X_train, y_train)

shap = ShapKernel(predict_fn=blackbox_model.predict_proba, data=X_train)
shap_local = shap.explain_local(X_test[:5], y_test[:5])

show(shap_local)

CNN With Not Normalized Data

In [9]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [10]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(org_dat, label, seed=i)

    parameters = {'activation': ['relu'], 'solver': ['sgd'],
                  'learning_rate': ['constant'],
                  'hidden_layer_sizes': (90,),
                  'max_iter': [200, 500, 1000], 'random_state': [i]}

    clf = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1, cv=10, verbose=1, scoring='f1')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

Fitting 10 folds for each of 3 candidates, totalling 30 fits
Fitting 10 folds for each of 3 candidates, totalling 30 fits
Fitting 10 folds for each of 3 candidates, totalling 30 fits
Fitting 10 folds for each of 3 candidates, totalling 30 fits
Fitting 10 folds for each of 3 candidates, totalling 30 fits


In [11]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

-----------------Results--------------------
Best score:  0.6696088108227871
MLPClassifier(hidden_layer_sizes=90, random_state=0, solver='sgd')


In [12]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

balanced accuracy 0.5073737035797926
unbalanced accuracy 0.7554485797251453
balanced precision 0.5037416068822991


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


unbalanced precision 0.021311475409836064
balanced recall 0.9983606557377049
unbalanced recall 0.021311475409836064
balanced f1 0.6696088108227871
unbalanced f1 0.021311475409836064
balanced auc 0.5049314700349369
unbalanced auc 0.798888091664751


Interpretation of CNN Results Using Kernel SHAP

Balanced

In [None]:
from interpret.blackbox import ShapKernel
seed = 1
X, y = balance_dt(org_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
cnn = MLPClassifier(random_state = seed, max_iter=200)

blackbox_model = Pipeline([('pca', pca), ('cnn', cnn)])
blackbox_model.fit(X_train, y_train)

shap = ShapKernel(predict_fn=blackbox_model.predict_proba, data=X_train)
shap_local = shap.explain_local(X_test[:5], y_test[:5])

show(shap_local)

Unbalanced

In [None]:
from interpret.blackbox import ShapKernel
seed = 1
X = org_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
cnn = MLPClassifier(random_state = seed, max_iter=200)

blackbox_model = Pipeline([('pca', pca), ('cnn', cnn)])
blackbox_model.fit(X_train, y_train)

shap = ShapKernel(predict_fn=blackbox_model.predict_proba, data=X_train)
shap_local = shap.explain_local(X_test[:5], y_test[:5])

show(shap_local)

Decision Tree With Normalized Data

In [None]:
best_sc = 0
best_x = []
best_y = []
best_es = None
initial_start_time = time.time()

In [None]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'criterion': ['gini', 'entropy'],
                  'min_samples_leaf': np.arange(1, 10),
                  'min_samples_split': np.arange(2, 10), 'random_state': [i], }

    clf = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, cv=10, verbose=1, scoring='f1')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

In [None]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [None]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

Interpreting Decision Tree Using Glassbox Method

Balanced

In [None]:
from interpret.glassbox import ClassificationTree

seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

dt = ClassificationTree(random_state=seed)
dt.fit(X_train, y_train)

dt_global = dt.explain_global()

show(dt_global)

Unbalanced

In [None]:
from interpret.glassbox import ClassificationTree

seed = 1
X = minmax_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

dt = ClassificationTree(random_state=seed)
dt.fit(X_train, y_train)

dt_global = dt.explain_global()

show(dt_global)

Decision Tree With Not Normalized Data

In [13]:
best_sc = 0
best_x = []
best_y = []
best_es = None
initial_start_time = time.time()

In [14]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(org_dat, label, seed=i)

    parameters = {'criterion': ['gini', 'entropy'],
                  'min_samples_leaf': np.arange(1, 10),
                  'min_samples_split': np.arange(2, 10), 'random_state': [i], }

    clf = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, cv=10, verbose=1, scoring='f1')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits


In [15]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

-----------------Results--------------------
Best score:  0.6257608649687237
DecisionTreeClassifier(min_samples_leaf=9, random_state=4)


In [16]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

balanced accuracy 0.6506490465038474
unbalanced accuracy 0.7198457535506871
balanced precision 0.6481940997052794
unbalanced precision 0.4931306683568743
balanced recall 0.6140677062508331
unbalanced recall 0.3646208183393309
balanced f1 0.6257608649687237
unbalanced f1 0.3536949511516238
balanced auc 0.6871947395905027
unbalanced auc 0.6460849106808653


Interpreting Decision Tree Using Glassbox Method

Balanced

In [None]:
from interpret.glassbox import ClassificationTree

seed = 1
X, y = balance_dt(org_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

dt = ClassificationTree(random_state=seed)
dt.fit(X_train, y_train)

dt_global = dt.explain_global()

show(dt_global)

Unbalanced

In [None]:
from interpret.glassbox import ClassificationTree

seed = 1
X = org_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

dt = ClassificationTree(random_state=seed)
dt.fit(X_train, y_train)

dt_global = dt.explain_global()

show(dt_global)

Logistic Regression With Normalized Data

In [None]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [None]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],
                  'random_state': [i], 'max_iter': [100, 300, 500, 1000]}

    clf = GridSearchCV(LogisticRegression(), parameters, n_jobs=-1, cv=10, verbose=1, scoring='recall')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

In [None]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [None]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

Interpreting Logistic Regression Using Glassbox Method

Balanced

In [None]:
from interpret.glassbox import LogisticRegression

seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

lr = LogisticRegression(random_state = seed)
lr.fit(X_train, y_train)

lr_global = lr.explain_global()
show(lr_global)

Unbalanced

In [None]:
from interpret.glassbox import LogisticRegression

seed = 1
X = minmax_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

lr = LogisticRegression(random_state = seed)
lr.fit(X_train, y_train)

lr_global = lr.explain_global()
show(lr_global)

Logistic Regression With Not Normalized Data

In [17]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [18]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(org_dat, label, seed=i)

    parameters = {'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],
                  'random_state': [i], 'max_iter': [100, 300, 500, 1000]}

    clf = GridSearchCV(LogisticRegression(), parameters, n_jobs=-1, cv=10, verbose=1, scoring='recall')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

Fitting 10 folds for each of 20 candidates, totalling 200 fits




Fitting 10 folds for each of 20 candidates, totalling 200 fits




Fitting 10 folds for each of 20 candidates, totalling 200 fits




Fitting 10 folds for each of 20 candidates, totalling 200 fits




Fitting 10 folds for each of 20 candidates, totalling 200 fits




In [19]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

-----------------Results--------------------
Best score:  0.9844262295081968
LogisticRegression(random_state=3, solver='saga')


In [20]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)



balanced accuracy 0.5102559384409502
unbalanced accuracy 0.7514713069978725


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


balanced precision 0.5050745643395572
unbalanced precision 0.3779363060048535


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


balanced recall 0.9844262295081968
unbalanced recall 0.10901639344262293




balanced f1 0.6675463588563378
unbalanced f1 0.13581061277717327




balanced auc 0.7233020999250572
unbalanced auc 0.7858432607414777


Interpreting Logistic Regression Using Glassbox Method

Balanced

In [None]:
from interpret.glassbox import LogisticRegression

seed = 1
X, y = balance_dt(org_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

lr = LogisticRegression(random_state = seed)
lr.fit(X_train, y_train)

lr_global = lr.explain_global()
show(lr_global)

Unbalanced

In [None]:
from interpret.glassbox import LogisticRegression

seed = 1
X = org_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

lr = LogisticRegression(random_state = seed)
lr.fit(X_train, y_train)

lr_global = lr.explain_global()
show(lr_global)

KNN With Normalized Data

In [None]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [None]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'n_neighbors': np.arange(1, 11), 'p': [1, 2]}

    clf = GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, cv=10, verbose=1, scoring='recall')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

In [None]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [None]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

Interpreting KNN Using Partial Dependence Plot

Balanced

In [None]:
from interpret.blackbox import PartialDependence
seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
knn = KNeighborsClassifier()

blackbox_model = Pipeline([('pca', pca), ('knn', knn)])
blackbox_model.fit(X_train, y_train)

pdp = PartialDependence(predict_fn=blackbox_model.predict_proba, data=X_train)
pdp_global = pdp.explain_global()

show(pdp_global)

Unbalanced

In [None]:
from interpret.blackbox import PartialDependence
seed = 1
X = minmax_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
knn = KNeighborsClassifier()

blackbox_model = Pipeline([('pca', pca), ('knn', knn)])
blackbox_model.fit(X_train, y_train)

pdp = PartialDependence(predict_fn=blackbox_model.predict_proba, data=X_train)
pdp_global = pdp.explain_global()

show(pdp_global)

KNN With Not Normalized Data

In [21]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [22]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(org_dat, label, seed=i)

    parameters = {'n_neighbors': np.arange(1, 11), 'p': [1, 2]}

    clf = GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, cv=10, verbose=1, scoring='recall')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

Fitting 10 folds for each of 20 candidates, totalling 200 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [None]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [23]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

balanced accuracy 0.6723603211776513
unbalanced accuracy 0.7467371772756024
balanced precision 0.6605113150119803
unbalanced precision 0.6039004449628574
balanced recall 0.6484606157536985
unbalanced recall 0.30399840063974415
balanced f1 0.6467209275755886
unbalanced f1 0.3197886957772048
balanced auc 0.719744397323038
unbalanced auc 0.6906640722859024


Interpreting KNN Using Partial Dependence Plot

Balanced

In [None]:
from interpret.blackbox import PartialDependence
seed = 1
X, y = balance_dt(org_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
knn = KNeighborsClassifier()

blackbox_model = Pipeline([('pca', pca), ('knn', knn)])
blackbox_model.fit(X_train, y_train)

pdp = PartialDependence(predict_fn=blackbox_model.predict_proba, data=X_train)
pdp_global = pdp.explain_global()

show(pdp_global)

Unbalanced

In [None]:
from interpret.blackbox import PartialDependence
seed = 1
X = org_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
knn = KNeighborsClassifier()

blackbox_model = Pipeline([('pca', pca), ('knn', knn)])
blackbox_model.fit(X_train, y_train)

pdp = PartialDependence(predict_fn=blackbox_model.predict_proba, data=X_train)
pdp_global = pdp.explain_global()

show(pdp_global)