# Load Data and Libraries


In [4]:
# load libraries
import os
import gc
import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import metrics, random forest, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix, classification_report, recall_score,precision_score, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, PrecisionRecallDisplay, auc, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


In [3]:
# load data
data = pd.read_csv(os.getcwd() + "\\data\\processed\\features_final_merge.csv")

## Feature Importance on default RF classifier

In [None]:
subset = "SCBC"
seed = 236
# feature_list = list(importance_df.head(5).index)

# sample from the data and undersample the negative class by random shuffling
pos_df = data[data["db"] == 1]
neg_df = data[data["db"] == 0]
neg_df = neg_df.sample(n=pos_df.shape[0], random_state=seed)
data_sample = pd.concat([pos_df, neg_df])
del pos_df, neg_df

# shuffle the data
data_sample = data_sample.sample(frac=1, random_state=seed)

# subset columns of data df based on regex
X_df = data_sample.filter(regex=subset)

X = data_sample.filter(regex=subset)
y = data_sample["db"]

# scale the data
X = StandardScaler().fit_transform(X)
X = pd.DataFrame(X, columns=X_df.columns)

# split train-test sets from this seed 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)



# train a randomforest classifier, set max features to 0.5
rf = RandomForestClassifier(n_estimators=250,
                            max_features=0.8, 
                            criterion="entropy",
                            random_state=seed,
                            min_samples_leaf=10,
                            max_depth=12)
rf.fit(X_train, y_train)
rf.feature_importances_

array([0.04996692, 0.0585296 , 0.32309847, 0.14822497, 0.05140692,
       0.05282734, 0.04689774, 0.03453662, 0.0315694 , 0.03411826,
       0.03103834, 0.03169341, 0.07314336, 0.03294865])

In [127]:
y_predprob = rf.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_predprob))

y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

0.7511783649700702
              precision    recall  f1-score   support

           0       0.65      0.78      0.71      3252
           1       0.73      0.59      0.66      3358

    accuracy                           0.68      6610
   macro avg       0.69      0.68      0.68      6610
weighted avg       0.69      0.68      0.68      6610



In [126]:
importance_df = pd.DataFrame(rf.feature_importances_, index=X_train.columns, columns=["importance"])
importance_df = importance_df.sort_values(by="importance", ascending=False)
# feature_list_imp = list(importance_df.head(10).index)
importance_df.head(10)

Unnamed: 0,importance
pearson_ABMS_raw,0.323098
spearman_ABMS_raw,0.148225
ABMS_raw_std_dif,0.073143
manhattan_ABMS_raw,0.05853
euclidean_ABMS_umap,0.052827
cosine_ABMS_umap,0.051407
euclidean_ABMS_raw,0.049967
manhattan_ABMS_umap,0.046898
cosine_ABMS_vae,0.034537
manhattan_ABMS_vae,0.034118


## Weight Analysis Linear Model 

## Pruned Tree

In [None]:
# subset columns of data df based on choices
feature_list = ["pearson_ABMS_raw", "spearman_ABMS_raw","ABMS_raw_std_dif","ABMS_vae_std_dif", "manhattan_ABMS_raw"]
X_df = data_sample[["pearson_SCBC_raw", "ABMS_raw_std_dif"]]

X = data_sample[["pearson_SCBC_raw", "ABMS_raw_std_dif"]]
y = data_sample["db"]

# scale the data
X = StandardScaler().fit_transform(X)
X = pd.DataFrame(X, columns=X_df.columns)

# split train-test sets from this seed 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# train a randomforest classifier, set max features to 0.5
rf2 = RandomForestClassifier(n_estimators=500,
                            max_features="sqrt", 
                            criterion="entropy",
                            random_state=seed,
                            min_samples_leaf=25,
                            max_depth=None)
rf2.fit(X_train, y_train)
rf2.feature_importances_

KeyError: "['ABMS_raw_std_diff'] not in index"

In [57]:
Y_pred2 = rf2.predict(X_test)

print(classification_report(y_test, Y_pred2))

Y_predprob2 = rf2.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, Y_predprob2))

              precision    recall  f1-score   support

           0       0.64      0.64      0.64      3283
           1       0.65      0.65      0.65      3327

    accuracy                           0.65      6610
   macro avg       0.65      0.65      0.65      6610
weighted avg       0.65      0.65      0.65      6610

0.6983442772153476


## Linear model

In [None]:
data_sample["ABMS_vae_std_dif"]

1655484    0.331571
3200652    0.304377
4615155    0.618195
2952380    0.609667
1647539    0.266003
             ...   
792418     0.892910
808293     0.485766
3003705    0.377529
3496674    0.358185
110612     0.309180
Name: ABMS_raw_std_dif, Length: 33048, dtype: float64

In [143]:
# subset columns of data df based on regex
feature_list = data_sample.columns[data_sample.columns.str.contains("ABMS")]
X_df = data_sample[feature_list]

X = data_sample[feature_list]
y = data_sample["db"]

# split train-test sets from this seed 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

classifier_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(penalty="l2", solver="liblinear", random_state=seed))
    ])

param_grid = {
        'logreg__C': [0.005,0.01,0.05,0.1]
    }
# create a cv fold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

grid = GridSearchCV(classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=4, verbose=1)

grid.fit(X_train, y_train)

model_best = grid.best_estimator_
# get probabilities back for roc auc
Y_pred3 = model_best.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, Y_pred3))

Fitting 10 folds for each of 4 candidates, totalling 40 fits
0.7413114355979772


In [123]:
feature_list_imp

['pearson_ABMS_raw',
 'spearman_ABMS_raw',
 'ABMS_raw_std_dif',
 'manhattan_ABMS_raw',
 'cosine_ABMS_umap',
 'euclidean_ABMS_umap',
 'manhattan_ABMS_umap',
 'euclidean_ABMS_raw',
 'manhattan_ABMS_vae',
 'cosine_ABMS_vae']

In [47]:
y_test

12559      1
613860     1
4545307    0
2611044    1
141790     0
          ..
1121252    1
4192548    0
3717421    0
718859     1
1225442    0
Name: db, Length: 6610, dtype: int64

## SVM Radial

In [None]:
# subset columns of data df based on regex
# feature_list = ["pearson_ABMS_raw", "spearman_ABMS_raw","ABMS_raw_std_dif","ABMS_vae_std_dif", "manhattan_ABMS_raw","euclidean_ABMS_umap"]
feature_list = data_sample.columns[data_sample.columns.str.contains("ABMS")]
X_df = data_sample[feature_list]

X = data_sample[feature_list]
y = data_sample["db"]

# split train-test sets from this seed 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

svm_classifier_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("svm", SVC(kernel="rbf", probability=True, random_state=seed))
    ])

# param_grid = {
#         'svm_C': [0.05,0.1,1,10],
#         'svm_gamma': ["scale","auto","0.01","0.1","1"]
#     }  

param_grid = {
        'svm__C': [0.1,1,2,5],
        'svm__gamma': [0.001,0.05,0.01,0.1,1]
    }  
# create a cv fold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

svm_grid = GridSearchCV(svm_classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=5, verbose=1)

svm_grid.fit(X_train, y_train)
model_best_svm = svm_grid.best_estimator_

# get probabilities back for roc auc
Y_pred4 = model_best_svm.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, Y_pred4))

Fitting 10 folds for each of 2 candidates, totalling 20 fits
0.7403675897985902


In [145]:
# save the model parameters
joblib.dump(svm_grid,os.getcwd() + "\\data\\processed\\svm_radial_CV_ABMS.joblib")

pd.DataFrame(svm_grid.cv_results_).to_csv(os.getcwd() + "\\data\\processed\\svm_radial_CV_ABMS.csv")
# SAVE CV results too


# os.getcwd() + "\\data\\processed\\svm_linear_CV_ABMS.joblib"

In [146]:
# subset columns of data df based on regex
# feature_list = ["pearson_ABMS_raw", "spearman_ABMS_raw","ABMS_raw_std_dif","ABMS_vae_std_dif", "manhattan_ABMS_raw","euclidean_ABMS_umap"]
feature_list = data_sample.columns[data_sample.columns.str.contains("SCBC")]
X_df = data_sample[feature_list]

X = data_sample[feature_list]
y = data_sample["db"]

# split train-test sets from this seed 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

svm_classifier_pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("svm", SVC(kernel="rbf", probability=True, random_state=seed))
    ])

# param_grid = {
#         'svm_C': [0.05,0.1,1,10],
#         'svm_gamma': ["scale","auto","0.01","0.1","1"]
#     }  

param_grid = {
        'svm__C': [0.05,0.1,1,2,5],
        'svm__gamma': [0.001,0.05,0.01,0.1,1]
    }  
# create a cv fold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

svm_grid = GridSearchCV(svm_classifier_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=6, verbose=1)

svm_grid.fit(X_train, y_train)
model_best_svm = svm_grid.best_estimator_

# get probabilities back for roc auc
Y_pred4 = model_best_svm.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, Y_pred4))

Fitting 10 folds for each of 25 candidates, totalling 250 fits
0.8096387012857621


In [None]:
# save the model parameters
joblib.dump(svm_grid,os.getcwd() + "\\data\\processed\\svm_radial_CV_SCBC.joblib")

pd.DataFrame(svm_grid.cv_results_).to_csv(os.getcwd() + "\\data\\processed\\svm_radial_CV_SCBC.csv")
# SAVE CV results too


In [1]:
pd.DataFrame(svm_grid.cv_results_).sort_values

NameError: name 'pd' is not defined

## SVM Linear

In [None]:
# subset columns of data df based on regex
# feature_list = ["pearson_ABMS_raw", "spearman_ABMS_raw","ABMS_raw_std_dif","ABMS_vae_std_dif", "manhattan_ABMS_raw","euclidean_ABMS_umap"]
seed = 236
# feature_list = list(importance_df.head(5).index)

# sample from the data and undersample the negative class by random shuffling
pos_df = data[data["db"] == 1]
neg_df = data[data["db"] == 0]
neg_df = neg_df.sample(n=pos_df.shape[0], random_state=seed)

data_sample = pd.concat([pos_df, neg_df])
del pos_df, neg_df

feature_list = data_sample.columns[data_sample.columns.str.contains("SCBC")]
X_df = data_sample[feature_list]

X = data_sample[feature_list]
y = data_sample["db"]

# split train-test sets from this seed 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

svm_linear_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", CalibratedClassifierCV(
              LinearSVC(penalty='l2', loss='squared_hinge', random_state=seed, max_iter=10000),
              cv=5))
])


param_grid = {
        'svm__base_estimator__C': [0.1,0.5,1,2,2.5,5,10]
    }  
# create a cv fold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

svm_lgrid = GridSearchCV(svm_linear_pipe, param_grid, cv=10, scoring='roc_auc', n_jobs=6, verbose=1)

svm_lgrid.fit(X_train, y_train)
model_best_svml = svm_lgrid.best_estimator_

# get probabilities back for roc auc
Y_pred5 = model_best_svml.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, Y_pred5))

Fitting 10 folds for each of 6 candidates, totalling 60 fits


  valid_params[key].set_params(**sub_params)


0.8026776147190808




In [13]:


# # save the model parameters
joblib.dump(svm_lgrid,os.getcwd() + "\\data\\processed\\svm_linear_CV_SCBC.joblib")

pd.DataFrame(svm_lgrid.cv_results_).sort_values(by="rank_test_score").to_csv(os.getcwd() + "\\data\\processed\\svm_linear_CV_SCBC.csv")
# # SAVE CV results too