In [None]:
import pandas as pd
from estimator import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef as MCC
from sklearn.metrics import cohen_kappa_score

In [15]:
def check_fitted(clf): 
    return hasattr(clf, "classes_")

In [16]:
def sensitivity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp+fn)
    return sensitivity

def specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn+fp)
    return specificity

scoring = {'rocauc': make_scorer(roc_auc_score),
           'accuracy': make_scorer(accuracy_score),
           'sensitivity': make_scorer(sensitivity), 
           'specificity': make_scorer(specificity),
           'mcc': make_scorer(MCC),
           'kappa': make_scorer(cohen_kappa_score)}

def print_results(cv_results, classifier):
    print(classifier)
    print('Sensitivity: ', "%.2f" % cv_results['test_sensitivity'].mean())
    print('Specificity: ', "%.2f" % cv_results['test_specificity'].mean())
    print('Accuracy: ', "%.2f" % cv_results['test_accuracy'].mean())  
    print('MCC: ', "%.2f" % cv_results['test_mcc'].mean())
    print('ROC: ', "%.2f" % cv_results['test_rocauc'].mean())  
    print('Cohen\'s Kappa: ', "%.2f" % cv_results['test_kappa'].mean())


def generate_results(clf, features, labels, fold_size=5):
    cv_results = cross_validate(clf.fit(features, labels), features, labels, scoring=scoring, cv=fold_size)
    print_results(cv_results, clf)

In [17]:
def print_results_test(y_true, y_pred, y_scores, classifier):
    print(classifier)
    print('Sensitivity: ', "%.2f" % sensitivity(y_true, y_pred))
    print('Specificity: ', "%.2f" % specificity(y_true, y_pred))
    print('Accuracy: ', "%.2f" % accuracy_score(y_true, y_pred))  
    print('MCC: ', "%.2f" % MCC(y_true, y_pred))
    print('ROC: ', "%.2f" % roc_auc_score(y_true, y_scores))
    print('Cohen\'s Kappa: ', "%.2f" % cohen_kappa_score(y_true, y_pred))

In [18]:
def evaluate_models_cv(clf1, clf2, clf3, clf4, clf5, X_train, Y_train):
    generate_results(clf1, X_train, Y_train)
    generate_results(clf2, X_train, Y_train)
    generate_results(clf3, X_train, Y_train)
    generate_results(clf4, X_train, Y_train)
    generate_results(clf5, X_train, Y_train)

def evaluate_models_test(clf1, clf2, clf3, clf4, clf5, X_valid, Y_valid):

    Y_valid_scores = clf1.predict_proba(X_valid)[:, 1]
    Y_valid_pred = clf1.predict(X_valid)
    print_results_test(Y_valid, Y_valid_pred, Y_valid_scores, clf1)

    Y_valid_scores = clf2.predict_proba(X_valid)[:, 1]
    Y_valid_pred = clf2.predict(X_valid)
    print_results_test(Y_valid, Y_valid_pred, Y_valid_scores, clf2)

    Y_valid_scores = clf3.predict_proba(X_valid)[:, 1]
    Y_valid_pred = clf3.predict(X_valid)
    print_results_test(Y_valid, Y_valid_pred, Y_valid_scores, clf3)

    Y_valid_scores = clf4.predict_proba(X_valid)[:, 1]
    Y_valid_pred = clf4.predict(X_valid)
    print_results_test(Y_valid, Y_valid_pred, Y_valid_scores, clf4)

    Y_valid_scores = clf5.predict_proba(X_valid)[:, 1]
    Y_valid_pred = clf5.predict(X_valid)
    print_results_test(Y_valid, Y_valid_pred, Y_valid_scores, clf5)

In [19]:
def hyper_parameter_tuning(X_train, Y_train, csvname):
    models1 = models1 = { 
        'SVC': SVC()
    }
    params1 = {
        'SVC': [
            {'kernel': ['linear'], 'C': [1,2,3,4,5,6,7,8,9,10]},
            {'kernel': ['rbf'], 'C': [1,2,3,4,5,10], 'gamma': [1,0.5,0.01,0.001, 0.0001]}
        ]
    }
    helper1 = EstimatorSelectionHelper(models1, params1)
    helper1.fit(X_train, Y_train, cv=5, scoring='roc_auc', n_jobs=-1)
    helper1.score_summary(sort_by='mean_score').to_csv(csvname, sep=',')

In [20]:
# pos_train0 = pd.read_csv('../dataset/pos_train', header=None)
# neg_train0 = pd.read_csv('../dataset/neg_train', header=None)
# pos_valid0 = pd.read_csv('../dataset/pos_valid', header=None)
# neg_valid0 = pd.read_csv('../dataset/neg_valid', header=None)

In [3]:
neg_train_c5 = pd.read_csv('../features/bp/nt/neg_train_n5', header=None)
neg_train_c10 = pd.read_csv('../features/bp/nt/neg_train_n10', header=None)
neg_train_c15 = pd.read_csv('../features/bp/nt/neg_train_n15', header=None)

In [4]:
neg_valid_c5 = pd.read_csv('../features/bp/nt/neg_valid_n5', header=None)
neg_valid_c10 = pd.read_csv('../features/bp/nt/neg_valid_n10', header=None)
neg_valid_c15 = pd.read_csv('../features/bp/nt/neg_valid_n15', header=None)

pos_train_c5 = pd.read_csv('../features/bp/nt/pos_train_n5', header=None)
pos_train_c10 = pd.read_csv('../features/bp/nt/pos_train_n10', header=None)
pos_train_c15 = pd.read_csv('../features/bp/nt/pos_train_n15', header=None, names = list(range(15*20+1)))

pos_valid_c5 = pd.read_csv('../features/bp/nt/pos_valid_n5', header=None)
pos_valid_c10 = pd.read_csv('../features/bp/nt/pos_valid_n10', header=None)
pos_valid_c15 = pd.read_csv('../features/bp/nt/pos_valid_n15', header=None)

In [8]:
neg_train_c5.drop(neg_train_c5.columns[len(neg_train_c5.columns)-1], axis=1, inplace=True)
neg_train_c10.drop(neg_train_c10.columns[len(neg_train_c10.columns)-1], axis=1, inplace=True)
neg_train_c15.drop(neg_train_c15.columns[len(neg_train_c15.columns)-1], axis=1, inplace=True)


neg_valid_c5.drop(neg_valid_c5.columns[len(neg_valid_c5.columns)-1], axis=1, inplace=True)
neg_valid_c10.drop(neg_valid_c10.columns[len(neg_valid_c10.columns)-1], axis=1, inplace=True)
neg_valid_c15.drop(neg_valid_c15.columns[len(neg_valid_c15.columns)-1], axis=1, inplace=True)

pos_train_c5.drop(pos_train_c5.columns[len(pos_train_c5.columns)-1], axis=1, inplace=True)
pos_train_c10.drop(pos_train_c10.columns[len(pos_train_c10.columns)-1], axis=1, inplace=True)
pos_train_c15.drop(pos_train_c15.columns[len(pos_train_c15.columns)-1], axis=1, inplace=True)

pos_valid_c5.drop(pos_valid_c5.columns[len(pos_valid_c5.columns)-1], axis=1, inplace=True)
pos_valid_c10.drop(pos_valid_c10.columns[len(pos_valid_c10.columns)-1], axis=1, inplace=True)
pos_valid_c15.drop(pos_valid_c15.columns[len(pos_valid_c15.columns)-1], axis=1, inplace=True)

In [9]:
neg_train_c5.dropna(inplace=True)
neg_train_c10.dropna(inplace=True)
neg_train_c15.dropna(inplace=True)


neg_valid_c5.dropna(inplace=True)
neg_valid_c10.dropna(inplace=True)
neg_valid_c15.dropna(inplace=True)

pos_train_c5.dropna(inplace=True)
pos_train_c10.dropna(inplace=True)
pos_train_c15.dropna(inplace=True)

pos_valid_c5.dropna(inplace=True)
pos_valid_c10.dropna(inplace=True)
pos_valid_c15.dropna(inplace=True)

In [10]:
neg_train_c5.reset_index(drop=True, inplace=True)
neg_train_c10.reset_index(drop=True, inplace=True)
neg_train_c15.reset_index(drop=True, inplace=True)


neg_valid_c5.reset_index(drop=True, inplace=True)
neg_valid_c10.reset_index(drop=True, inplace=True)
neg_valid_c15.reset_index(drop=True, inplace=True)

pos_train_c5.reset_index(drop=True, inplace=True)
pos_train_c10.reset_index(drop=True, inplace=True)
pos_train_c15.reset_index(drop=True, inplace=True)

pos_valid_c5.reset_index(drop=True, inplace=True)
pos_valid_c10.reset_index(drop=True, inplace=True)
pos_valid_c15.reset_index(drop=True, inplace=True)

In [11]:
neg_train_c5['flag'] = 0
neg_train_c10['flag'] = 0
neg_train_c15['flag'] = 0


neg_valid_c5['flag'] = 0
neg_valid_c10['flag'] = 0
neg_valid_c15['flag'] = 0

pos_train_c5['flag'] = 1
pos_train_c10['flag'] = 1
pos_train_c15['flag'] = 1

pos_valid_c5['flag'] = 1
pos_valid_c10['flag'] = 1
pos_valid_c15['flag'] = 1

In [12]:
train_c5 = pd.concat([pos_train_c5, neg_train_c5])
train_c5 = train_c5.sample(frac=1).reset_index(drop=True)
X_c5 = train_c5.drop(train_c5.columns[len(train_c5.columns)-1], axis=1)
Y_c5 = train_c5['flag']

train_c10 = pd.concat([pos_train_c10, neg_train_c10])
train_c10 = train_c10.sample(frac=1).reset_index(drop=True)
X_c10 = train_c10.drop(train_c10.columns[len(train_c10.columns)-1], axis=1)
Y_c10 = train_c10['flag']

train_c15 = pd.concat([pos_train_c15, neg_train_c15])
train_c15 = train_c15.sample(frac=1).reset_index(drop=True)
X_c15 = train_c15.drop(train_c15.columns[len(train_c15.columns)-1], axis=1)
Y_c15 = train_c15['flag']

In [13]:
valid_c5 = pd.concat([pos_valid_c5, neg_valid_c5])
valid_c5 = valid_c5.sample(frac=1).reset_index(drop=True)
Xval_c5 = valid_c5.drop(valid_c5.columns[len(valid_c5.columns)-1], axis=1)
Yval_c5 = valid_c5['flag']

valid_c10 = pd.concat([pos_valid_c10, neg_valid_c10])
valid_c10 = valid_c10.sample(frac=1).reset_index(drop=True)
Xval_c10 = valid_c10.drop(valid_c10.columns[len(valid_c10.columns)-1], axis=1)
Yval_c10 = valid_c10['flag']

valid_c15 = pd.concat([pos_valid_c15, neg_valid_c15])
valid_c15 = valid_c15.sample(frac=1).reset_index(drop=True)
Xval_c15 = valid_c15.drop(valid_c15.columns[len(valid_c15.columns)-1], axis=1)
Yval_c15 = valid_c15['flag']

In [20]:
from estimator import *
helper1 = EstimatorSelectionHelper(models1, params1)

In [21]:
helper1.fit(X_c5, Y_c5, scoring='auc_roc', n_jobs=-1)

Running GridSearchCV for SVC.
Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  2.0min finished


In [None]:
helper1.fit(X_c10, Y_c10, scoring='accuracy', n_jobs=-1)
helper1.score_summary(sort_by='mean_score').to_csv("parameters_bpp-nt-10.csv", sep=',')

In [30]:
helper1.fit(X_c15, Y_c15, scoring='accuracy', n_jobs=-1)
helper1.score_summary(sort_by='mean_score').to_csv("parameters_bpp-nt-15.csv", sep=',')

Running GridSearchCV for SVC.
Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.5min


SVC


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  4.4min finished
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)


In [22]:
helper1.score_summary(sort_by='mean_score').to_csv("parameters_bpp-nt-5.csv", sep=',')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)


SVC


In [29]:
clf1 = SVC(C=2, gamma=0.5, kernel='rbf')
clf2 = SVC(C=10, gamma=0.01, kernel='rbf')

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef as MCC
from sklearn.metrics import cohen_kappa_score

def sensitivity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp+fn)
    return sensitivity

def specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn+fp)
    return specificity


scoring = {'rocauc': make_scorer(roc_auc_score),
           'accuracy': make_scorer(accuracy_score),
           'sensitivity': make_scorer(sensitivity), 
           'specificity': make_scorer(specificity),
           'mcc': make_scorer(MCC),
           'kappa': make_scorer(cohen_kappa_score)}

In [25]:
def print_results(cv_results, classifier):
    print(classifier)
    print('Sensitivity: ', "%.2f" % cv_results['test_sensitivity'].mean())
    print('Specificity: ', "%.2f" % cv_results['test_specificity'].mean())
    print('Accuracy: ', "%.2f" % cv_results['test_accuracy'].mean())  
    print('MCC: ', "%.2f" % cv_results['test_mcc'].mean())
    print('ROC: ', "%.2f" % cv_results['test_rocauc'].mean())  
    print('Cohen\'s Kappa: ', "%.2f" % cv_results['test_kappa'].mean())


def generate_results(clf, features, labels, classifier, fold_size=5):
    cv_results = cross_validate(clf.fit(features, labels), features, labels, scoring=scoring, cv=fold_size)
    print_results(cv_results, classifier)

In [33]:
generate_results(clf1, X_c5, Y_c5, '(C=2, gamma=0.5, kernel=\'rbf\')')
generate_results(clf2, X_c10, Y_c10, '(C=10, gamma=0.01, kernel=\'rbf\')')
generate_results(clf2, X_c15, Y_c15, '(C=2, gamma=0.5, kernel=\'rbf\')')

(C=2, gamma=0.5, kernel='rbf')
Sensitivity:  0.66
Specificity:  0.73
Accuracy:  0.70
MCC:  0.39
ROC:  0.70
Cohen's Kappa:  0.39
(C=10, gamma=0.01, kernel='rbf')
Sensitivity:  0.72
Specificity:  0.66
Accuracy:  0.69
MCC:  0.39
ROC:  0.69
Cohen's Kappa:  0.39
(C=2, gamma=0.5, kernel='rbf')
Sensitivity:  0.58
Specificity:  0.70
Accuracy:  0.65
MCC:  0.28
ROC:  0.64
Cohen's Kappa:  0.28


In [34]:
generate_results(clf1, Xval_c5, Yval_c5, '(C=2, gamma=0.5, kernel=\'rbf\')')
generate_results(clf2, Xval_c10, Yval_c10, '(C=10, gamma=0.01, kernel=\'rbf\')')
generate_results(clf2, Xval_c15, Yval_c15, '(C=10, gamma=0.01, kernel=\'rbf\')')

(C=2, gamma=0.5, kernel='rbf')
Sensitivity:  0.58
Specificity:  0.69
Accuracy:  0.64
MCC:  0.28
ROC:  0.64
Cohen's Kappa:  0.28
(C=10, gamma=0.01, kernel='rbf')
Sensitivity:  0.63
Specificity:  0.49
Accuracy:  0.56
MCC:  0.12
ROC:  0.56
Cohen's Kappa:  0.12
(C=10, gamma=0.01, kernel='rbf')
Sensitivity:  0.56
Specificity:  0.59
Accuracy:  0.57
MCC:  0.15
ROC:  0.57
Cohen's Kappa:  0.15
