In [20]:
import random
import time

import scipy.io
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
import numpy as np
import scipy.io
from scipy import interp
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from interpret import show
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [21]:
def balance_dt(data, label, seed=None):
    random.seed(seed)
    ones = []
    for i in range(len(label)):
        if label[i] == 1:
            ones.append(i)
    zeros = []
    for i in range(len(label)):
        if label[i] == 0:
            zeros.append(i)
    zeros = random.sample(zeros, len(ones))
    indices = zeros + ones
    X = data[indices]
    y = label[indices]
    return X, y

In [22]:
def balance_reversed(data, label, seed=None):
    random.seed(seed)
    zeros = []
    for i in range(len(label)):
        if label[i] == 0:
            zeros.append(i)
    ones = []
    for i in range(len(label)):
        if label[i] == 1:
            ones.append(i)
    ones = random.sample(ones, len(zeros))
    indices = zeros + ones
    X = data[indices]
    y = label[indices]
    return X, y


In [23]:
def eval_with_kfold(best_clf, x, y, org_dt, org_lb):
    cros_res = cross_val_score(best_clf, x, y, cv=5, scoring='accuracy')
    print("cross_res accuracy", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=5, scoring='accuracy')
    print("cross_res accuracy", np.mean(cros_res))

    cros_res = cross_val_score(best_clf, x, y, cv=5, scoring='precision')
    print("cross_res precision", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=5, scoring='precision')
    print("cross_res precision", np.mean(cros_res))

    cros_res = cross_val_score(best_clf, x, y, cv=5, scoring='recall')
    print("cross_res recall", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=5, scoring='recall')
    print("cross_res recall", np.mean(cros_res))

    cros_res = cross_val_score(best_clf, x, y, cv=5, scoring='f1')
    print("cross_res f1", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=5, scoring='f1')
    print("cross_res f1", np.mean(cros_res))

    cros_res = cross_val_score(best_clf, x, y, cv=5, scoring='roc_auc')
    print("cross_res auc", np.mean(cros_res))
    cros_res = cross_val_score(best_clf, org_dt, org_lb, cv=5, scoring='roc_auc')
    print("cross_res auc", np.mean(cros_res))

In [24]:
mat = scipy.io.loadmat('data.mat')
org_dat = mat['OriginalData']
stand_dat = mat['Scaled_Standardization']
minmax_dat = mat['Scaled_Min_Max']
label = mat['label'][0]

In [25]:
best_sc = 0
best_x = []
best_y = []
best_es = None

SVM - Balanced

In [26]:
for i in range(5):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'kernel': ['linear', 'poly', 'rbf'], 'random_state': [i], 'C': [1, 2, 3, 4, 5]}

    clf = GridSearchCV(SVC(probability=True), parameters, n_jobs=-1, cv=5, verbose=1, scoring='f1')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [27]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

-----------------Results--------------------
Best score:  0.6049742303182392
SVC(C=5, probability=True, random_state=2)


In [28]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

cross_res accuracy 0.68297797445439
cross_res accuracy 0.7522291038345541
cross_res precision 0.7297626104702991
cross_res precision 0.755096695419276
cross_res recall 0.5321846771495483
cross_res recall 0.20327199732351958
cross_res f1 0.6049742303182392
cross_res f1 0.19436882770031502
cross_res auc 0.7546604489686226
cross_res auc 0.6229464250418264


SVM - Unbalanced

In [29]:
for i in range(5):
    random.seed(i)
    X = minmax_dat
    y = label

    parameters = {'kernel': ['linear', 'poly', 'rbf'], 'random_state': [i], 'C': [1, 2, 3, 4, 5]}

    clf = GridSearchCV(SVC(probability=True), parameters, n_jobs=-1, cv=5, verbose=1, scoring='f1')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [30]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

-----------------Results--------------------
Best score:  0.6049742303182392
SVC(C=5, probability=True, random_state=2)


In [31]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

cross_res accuracy 0.68297797445439
cross_res accuracy 0.7522291038345541
cross_res precision 0.7297626104702991
cross_res precision 0.755096695419276
cross_res recall 0.5321846771495483
cross_res recall 0.20327199732351958
cross_res f1 0.6049742303182392
cross_res f1 0.19436882770031502
cross_res auc 0.7546604489686226
cross_res auc 0.6229464250418264


Interpretation of SVM results

In [36]:
from interpret.blackbox import LimeTabular

seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
svc = SVC(gamma='auto', probability=True)

blackbox_model = Pipeline([('pca', pca), ('svc', svc)])
blackbox_model.fit(X_train, y_train)

lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train)
lime_local = lime.explain_local(X_test[:5], y_test[:5])

show(lime_local)

In [37]:
from interpret.blackbox import LimeTabular

seed = 1
X = minmax_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
svc = SVC(gamma='auto', probability=True)

blackbox_model = Pipeline([('pca', pca), ('svc', svc)])
blackbox_model.fit(X_train, y_train)

lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train)
lime_local = lime.explain_local(X_test[:5], y_test[:5])

show(lime_local)

CNN

In [None]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [None]:
for i in range(10):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'activation': ['relu'], 'solver': ['sgd'],
                  'learning_rate': ['constant'],
                  'hidden_layer_sizes': (90,),
                  'max_iter': [200, 500, 1000], 'random_state': [i]}

    clf = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1, cv=3, verbose=1, scoring='f1')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

In [None]:

print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [None]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

In [None]:
from interpret.blackbox import ShapKernel
seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

pca = PCA()
cnn = MLPClassifier(random_state = seed, max_iter=200)

blackbox_model = Pipeline([('pca', pca), ('cnn', cnn)])
blackbox_model.fit(X_train, y_train)

shap = ShapKernel(predict_fn=blackbox_model.predict_proba, data=X_train)
shap_local = shap.explain_local(X_test[:5], y_test[:5])

show(shap_local)

Decision Tree

In [None]:
best_sc = 0
best_x = []
best_y = []
best_es = None
initial_start_time = time.time()

In [None]:
for i in range(10):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'criterion': ['gini', 'entropy'],
                  'min_samples_leaf': np.arange(1, 10),
                  'min_samples_split': np.arange(2, 10), 'random_state': [i], }

    clf = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1, cv=3, verbose=1, scoring='f1')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y


In [None]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [None]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

In [None]:
from interpret.glassbox import ClassificationTree

seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

dt = ClassificationTree(random_state=seed)
dt.fit(X_train, y_train)

dt_global = dt.explain_global()

show(dt_global)

In [None]:
from interpret.glassbox import ClassificationTree

seed = 1
X = minmax_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

dt = ClassificationTree(random_state=seed)
dt.fit(X_train, y_train)

dt_global = dt.explain_global()

show(dt_global)

KNN

In [None]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [None]:
for i in range(10):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'n_neighbors': np.arange(1, 11), 'p': [1, 2]}

    clf = GridSearchCV(KNeighborsClassifier(), parameters, n_jobs=-1, cv=3, verbose=1, scoring='recall')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

In [None]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [None]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

Logistic Regression

In [None]:
best_sc = 0
best_x = []
best_y = []
best_es = None

In [None]:
for i in range(10):
    random.seed(i)
    X, y = balance_dt(minmax_dat, label, seed=i)

    parameters = {'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],
                  'random_state': [i], 'max_iter': [100, 300, 500, 1000]}

    clf = GridSearchCV(LogisticRegression(), parameters, n_jobs=-1, cv=10, verbose=1, scoring='recall')
    clf.fit(X, y)

    if clf.best_score_ > best_sc:
        best_sc = clf.best_score_
        best_es = clf.best_estimator_
        best_x = X
        best_y = y

In [None]:
print("-----------------Results--------------------")
print("Best score: ", best_sc)
print(best_es)

In [None]:
eval_with_kfold(best_es, best_x, best_y, minmax_dat, label)

In [None]:
from interpret.glassbox import LogisticRegression

seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

lr = LogisticRegression(random_state = seed)
lr.fit(X_train, y_train)

lr_global = lr.explain_global()
show(lr_global)

In [None]:
from interpret.glassbox import LogisticRegression

seed = 1
X = minmax_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

lr = LogisticRegression(random_state = seed)
lr.fit(X_train, y_train)

lr_global = lr.explain_global()
show(lr_global)

In [None]:
from interpret.glassbox import ExplainableBoostingClassifier

seed = 1
X, y = balance_dt(minmax_dat, label, seed = seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

ebm = ExplainableBoostingClassifier(random_state=seed)
ebm.fit(X_train, y_train)

ebm_global = ebm.explain_global()
show(ebm_global)

In [None]:
from interpret.glassbox import ExplainableBoostingClassifier

seed = 1
X = minmax_dat
y = label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = seed)

ebm = ExplainableBoostingClassifier(random_state=seed)
ebm.fit(X_train, y_train)

ebm_global = ebm.explain_global()
show(ebm_global)