## Classification Experiment

In [1]:
import collections
import json
import os
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from mne_bids import get_entities_from_fname
from natsort import natsorted
#from rerf.rerfClassifier import rerfClassifier

# comparative classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier


from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import resample
from sklearn.utils.metaestimators import _safe_split
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    f1_score,
    roc_curve,
    balanced_accuracy_score,
    accuracy_score,
    auc,
    brier_score_loss,
    plot_precision_recall_curve,
    precision_recall_curve,
)
from sklearn.model_selection import (
    GroupKFold,
    cross_validate,
    GroupShuffleSplit,
    StratifiedShuffleSplit
)

#import dabest
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

# import feature loading code
from sample_code.study import (
    load_patient_dict,
    determine_feature_importances,
    tune_hyperparameters,
    # extract_Xy_pairs,
    # _sequential_aggregation,
    format_supervised_dataset,
)

from sample_code.io import read_participants_tsv, load_feature_data
from sample_code.utils import _load_turbo, _plot_roc_curve, NumpyEncoder

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Clinical subject groups to include, mapped to the integer class label used as `y`.
include_subject_groups = {"non-epilepsy": 0, "epilepsy-normal": 1}

# Feature groups that are loaded and merged into one feature matrix.
include_feature_groups = ["sourcesink", "fragility"]

# Remaining experiment switches.
cross_val_splits, classify_abnormal = 10, False

In [3]:
def average_roc(fpr, tpr):
    """Interpolate per-split ROC curves onto a common false-positive-rate grid.

    Despite its name, this function returns the *per-split* interpolated
    curves and AUCs; callers that need a mean/std curve should aggregate the
    returned ``tprs`` / ``aucs`` themselves (e.g. ``np.mean(tprs, axis=0)``).

    Parameters
    ----------
    fpr : sequence of array-like
        One array of false positive rates per CV split (increasing, as
        required by ``np.interp``).
    tpr : sequence of array-like
        One array of true positive rates per CV split, aligned with ``fpr``.

    Returns
    -------
    mean_fpr : np.ndarray of shape (200,)
        Evenly spaced FPR grid on [0, 1] shared by all interpolated curves.
    tprs : list of np.ndarray
        TPR of each split interpolated on ``mean_fpr``; the first point of
        every curve is pinned to 0.0.
    aucs : list of float
        Area under each interpolated curve.
    """
    mean_fpr = np.linspace(0, 1, 200)

    n_splits = len(fpr)
    print(f"Computing average ROC over {n_splits} CV splits")

    tprs = []
    aucs = []
    for i in range(n_splits):
        interp_tpr = np.interp(mean_fpr, fpr[i], tpr[i])
        # force every curve to start at the origin
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(auc(mean_fpr, interp_tpr))

    # The mean/std statistics that used to be computed here were never
    # returned and crashed on an empty list of splits, so they were dropped.
    return mean_fpr, tprs, aucs

In [4]:
def combine_patient_predictions(
    y_trues, ypred_probs, subject_groups, pat_predictions=None, pat_true=None
):
    """Group per-sample predicted probabilities and true labels by subject.

    Parameters
    ----------
    y_trues : iterable
        True label of each sample.
    ypred_probs : iterable
        Predicted probability of each sample (converted with ``float``).
    subject_groups : iterable
        Subject identifier of each sample; ``str(subject)`` is used as the key.
    pat_predictions : dict | None
        Existing mapping ``subject -> list of probabilities`` to extend.
        A new ``defaultdict(list)`` is created when ``None``.
    pat_true : dict | None
        Existing mapping ``subject -> true label`` to extend.
        A new ``dict`` is created when ``None``.

    Returns
    -------
    pat_predictions : dict
        Subject key -> list of predicted probabilities.
    pat_true : dict
        Subject key -> true label.

    Raises
    ------
    RuntimeError
        If two samples of the same subject carry different true labels.

    Notes
    -----
    The three iterables are consumed with ``zip``, so they are expected to
    have the same length; extra items of longer inputs are ignored.
    """
    # Default each accumulator independently so that a caller-supplied one is
    # never thrown away just because the other was omitted.
    if pat_predictions is None:
        pat_predictions = collections.defaultdict(list)
    if pat_true is None:
        pat_true = dict()

    for ytrue, ypred_proba, subject in zip(y_trues, ypred_probs, subject_groups):
        # Always look up with the same string key that is stored; testing the
        # raw (e.g. integer) subject would never match and the label
        # consistency check below would silently be skipped.
        key = str(subject)
        # setdefault also works when a plain dict was passed in
        pat_predictions.setdefault(key, []).append(float(ypred_proba))

        if key not in pat_true:
            pat_true[key] = ytrue
        elif pat_true[key] != ytrue:
            raise RuntimeError("subjects should match...")
    return pat_predictions, pat_true

## Define Data Directories

In [5]:
# Fixed seeds so that shuffles/splits are repeatable across runs.
# `seed` feeds NumPy's global RNG; `random_state` is read as a global by
# `load_data` when it builds its StratifiedShuffleSplit.
seed = 12345
random_state = 12345
np.random.seed(seed)

# proportion of subjects used for training
# NOTE(review): `load_data` hard-codes train_size=0.5 and does not read this
# value in the cells visible here — confirm which one is intended.
train_size = 0.6

# classification model to use
clf_type = "mtmorf"

# BIDS related directories
# The triple-quoted block below is disabled code kept as a bare string
# expression: nothing inside it is executed. Because it is the last expression
# of the cell, the notebook echoes its repr as the cell output.
"""bids_root = Path("...")
deriv_path = bids_root / "derivatives"
source_path = bids_root = "sourcedata"

excel_fpath = source_path / "..."

intermed_fpath = path(deriv_path) / "baselinesliced"

# where to save results
study_path = Path(deriv_path) / "study"

# feature names
feature_names = [
    "kl_div"
]"""

'bids_root = Path("...")\nderiv_path = bids_root / "derivatives"\nsource_path = bids_root = "sourcedata"\n\nexcel_fpath = source_path / "..."\n\nintermed_fpath = path(deriv_path) / "baselinesliced"\n\n# where to save results\nstudy_path = Path(deriv_path) / "study"\n\n# feature names\nfeature_names = [\n    "kl_div"\n]'

In [6]:
# defining evaluation criterion
metric = "roc_auc"
BOOTSTRAP = False

# defining hyperparameters

ncores = -1
num_runs = 1
n_est = 500  # number of estimators


names = {
    "Log. Reg": "blue",
    #"Lin. SVM": "firebrick",
}
classifiers = [
    LogisticRegression(random_state=0, n_jobs=ncores, solver="liblinear"),
    #LinearSVC(),
]

In [7]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.python.client import device_lib

In [16]:
def load_data(
    feature_name,
    include_groups,
    deriv_path,
    excel_fpath,
    feature_names,
    json_fpath,
    patient_aggregation_method=None,
    intermed_fpath=None,
    save_cv_indices: bool=False,
):
    """Load one feature group and optionally save fixed train/test splits.

    Parameters
    ----------
    feature_name : str
        Feature group to load: ``"sourcesink"``, ``"spikes"`` or ``"fragility"``.
    include_groups : dict
        Forwarded to ``load_feature_data`` as ``include_groups``.
    deriv_path : pathlib.Path | str
        Derivatives directory. Data are read from here unless
        ``intermed_fpath`` is given; split indices are always saved below
        ``deriv_path / "study"``.
    excel_fpath, feature_names, json_fpath
        Forwarded unchanged to ``load_feature_data``.
    patient_aggregation_method : optional
        Currently unused; kept for interface compatibility.
    intermed_fpath : pathlib.Path | str | None
        If truthy, data are read from this directory instead of ``deriv_path``.
    save_cv_indices : bool
        If True, draw 10 stratified shuffle splits (train_size=0.5, seeded
        with the notebook-level ``random_state``) and save the sample indices
        and the unique subjects of each side to
        ``<deriv_path>/study/inds/fixed_folds_subjects/<feature_name>-srerf-<k>-inds.npz``.

    Returns
    -------
    tuple | None
        ``(unformatted_X, y, subject_groups, subjects, ch_names, centers)``
        with ``subject_groups`` converted to an integer array, or ``None``
        (after printing a message) when ``feature_name`` is not recognised.

    Notes
    -----
    The split is stratified over *samples*; ``train_pats`` / ``test_pats`` are
    the unique subjects found on each side, so a subject that contributes
    several samples can appear in both lists.
    """
    # on-disk extension of each supported feature group
    feature_extensions = {
        "sourcesink": "npy",
        "spikes": "json",
        "fragility": "npy",
    }
    if feature_name not in feature_extensions:
        print(f"{feature_name} is unknown")
        return None

    # an intermediate directory, when provided, takes precedence as data root
    data_root = intermed_fpath if intermed_fpath else deriv_path
    (
        unformatted_X,
        y,
        subject_groups,
        subjects,
        ch_names,
        centers,
    ) = load_feature_data(
        feature_name,
        feature_extensions[feature_name],
        data_root,
        excel_fpath=excel_fpath,
        feature_names=feature_names,
        json_fpath=json_fpath,
        include_groups=include_groups,
    )
    subject_groups = np.array([int(g) for g in subject_groups])

    # create held-out test dataset:
    # a separate pool of subjects for testing, stored once so that every
    # classifier is evaluated on the same fixed folds
    if save_cv_indices:
        inds_dir = Path(deriv_path) / "study" / "inds" / "fixed_folds_subjects"
        inds_dir.mkdir(parents=True, exist_ok=True)

        sss = StratifiedShuffleSplit(n_splits=10, train_size=0.5, random_state=random_state)
        for jdx, (train_inds, test_inds) in enumerate(sss.split(unformatted_X, y)):
            np.savez_compressed(
                inds_dir / f"{feature_name}-srerf-{jdx}-inds.npz",
                train_inds=train_inds,
                test_inds=test_inds,
                train_pats=np.unique(subject_groups[train_inds]),
                test_pats=np.unique(subject_groups[test_inds]),
            )
    return unformatted_X, y, subject_groups, subjects, ch_names, centers
    

In [9]:
def run_clf_validation(
    clf_type,
    clf_func,
    unformatted_X,
    y,
    subject_groups,
    study_path,
    feature_name='sourcesink',
    fold_ids=None,
):
    """Fit and evaluate a classifier on each pre-saved subject-level fold.

    For every fold the train/test subjects are read from
    ``<study_path>/inds/fixed_folds_subjects/sourcesink-srerf-<fold>-inds.npz``,
    the classifier is fit on the training samples through ``cross_validate``
    and the fitted estimator is evaluated on the held-out test samples. One
    JSON file of results is written per fold to
    ``<study_path>/clf-train-vs-test/study_cv_scores_<clf_type>_<feature_name>_<fold>.json``.

    Parameters
    ----------
    clf_type : str
        Label of the classifier; used in output file names. ``"rf"`` enables
        parallel feature-importance computation, and ``"srerf"`` / ``"mtmorf"``
        skip saving the fitted estimator.
    clf_func : estimator
        Unfitted scikit-learn compatible classifier exposing ``predict_proba``.
    unformatted_X : list | np.ndarray
        Raw feature data, passed to ``format_supervised_dataset``.
    y : array-like
        Class label per sample.
    subject_groups : array-like
        Subject identifier per sample, matched against the saved
        ``train_pats`` / ``test_pats``.
    study_path : pathlib.Path | str
        Root directory for fold indices and outputs.
    feature_name : str
        Only used in output file names. The fold-index files are always read
        with the ``sourcesink-srerf`` prefix.
    fold_ids : iterable of int | None
        Fold numbers to evaluate. Defaults to ``range(1, 10)``, which matches
        the previous hard-coded behaviour (fold 0 is not evaluated).

    Returns
    -------
    None
        Results are written to disk.

    Notes
    -----
    All ``validate_*`` entries are computed on the held-out test samples only.
    NOTE(review): ``dropped_inds`` returned by ``format_supervised_dataset`` is
    not applied to ``y`` / ``subject_groups``; this assumes no samples are
    dropped — confirm against that helper.
    """
    unformatted_X = unformatted_X.copy()
    y = np.array(y)
    subject_groups = np.array(subject_groups)
    study_path = Path(study_path)
    if fold_ids is None:
        fold_ids = range(1, 10)

    # metrics computed by cross_validate on the train and test side of a fold
    scoring = ["roc_auc", "accuracy", "balanced_accuracy", "average_precision"]

    # formatting does not depend on the fold, so do it once
    X_formatted, _dropped_inds = format_supervised_dataset(unformatted_X)

    clf = clf_func
    is_keras = isinstance(clf, KerasClassifier)
    if is_keras:
        print(X_formatted.shape)
        # NOTE(review): `window` is not defined in this function; it must
        # exist as a notebook-level variable for this branch to work.
        X_formatted = X_formatted.reshape(len(y), 20, np.sum(window), 1)
        print("new shape: ", X_formatted.shape)
        y = y.reshape(-1, 1)

    # parallelise feature importances only for random forests
    n_jobs = -1 if clf_type == "rf" else 1

    out_dir = study_path / "clf-train-vs-test"
    (out_dir / "classifiers").mkdir(exist_ok=True, parents=True)

    for jdx in fold_ids:
        cv_scores = collections.defaultdict(list)

        # NOTE: allow_pickle=True can execute arbitrary code for untrusted
        # files; only load index files produced by this project.
        with np.load(
            study_path
            / "inds"
            / "fixed_folds_subjects"
            / f"sourcesink-srerf-{jdx}-inds.npz",
            allow_pickle=True,
        ) as data_dict:
            train_pats, test_pats = data_dict["train_pats"], data_dict["test_pats"]

        # set train/test sample indices based on which subjects they belong to
        train_inds = np.flatnonzero(np.isin(subject_groups, train_pats))
        test_inds = np.flatnonzero(np.isin(subject_groups, test_pats))

        print("Updated classifier: ", clf)

        # a single predefined (train, test) split; cross_validate clones and
        # fits the classifier on the training side only
        scores = cross_validate(
            clf,
            X_formatted,
            y,
            groups=subject_groups,
            cv=[(train_inds, test_inds)],
            scoring=scoring,
            return_estimator=True,
            return_train_score=True,
        )

        # removing array like structure (there is exactly one split)
        estimator = scores["estimator"][0]
        for key, val in scores.items():
            if key != "estimator":
                cv_scores[key].append(float(val[0]))

        # linear models: store [intercept, coefficients...] as one flat list
        if hasattr(estimator, "coef_"):
            coeff = np.concatenate(
                [
                    np.ravel(getattr(estimator, "intercept_", 0.0)),
                    np.ravel(estimator.coef_),
                ]
            )
            cv_scores["coeff"].append(list(coeff))

        # evaluate strictly on the held-out test samples
        X_test = np.asarray(X_formatted)[test_inds, ...]
        y_test = y[test_inds]
        groups_test = subject_groups[test_inds]

        y_pred_prob = estimator.predict_proba(X_test)[:, 1]
        y_pred = estimator.predict(X_test)

        cv_scores["validate_ytrue"].append(list(y_test))
        cv_scores["validate_ypred_prob"].append(list(y_pred_prob))
        cv_scores["validate_ypred"].append(list(y_pred))
        cv_scores["validate_subject_groups"].append(list(groups_test))

        # store ROC curve metrics on the held-out test set
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
        # NOTE(review): this second call scores class 0 as "positive" using
        # the class-1 probability; the resulting arrays are stored under the
        # fnr/tnr keys as before — confirm this is what consumers expect.
        fnr, tnr, neg_thresholds = roc_curve(y_test, y_pred_prob, pos_label=0)
        cv_scores["validate_fpr"].append(list(fpr))
        cv_scores["validate_tpr"].append(list(tpr))
        cv_scores["validate_fnr"].append(list(fnr))
        cv_scores["validate_tnr"].append(list(tnr))
        cv_scores["validate_thresholds"].append(list(thresholds))

        # calibration curve: try quantile bins first, then fewer uniform bins;
        # fall back to placeholders if both fail (best effort)
        fraction_of_positives, mean_predicted_value = [None], [None]
        for n_bins, strategy in ((10, "quantile"), (5, "uniform")):
            try:
                fraction_of_positives, mean_predicted_value = calibration_curve(
                    y_test, y_pred_prob, n_bins=n_bins, strategy=strategy
                )
                break
            except Exception as e:
                print(e)
        clf_brier_score = np.round(
            brier_score_loss(y_test, y_pred_prob, pos_label=np.array(y).max()), 2
        )

        print("Done analyzing calibration stats...")

        # store ingredients for a calibration curve
        cv_scores["validate_brier_score"].append(float(clf_brier_score))
        cv_scores["validate_fraction_pos"].append(list(fraction_of_positives))
        cv_scores["validate_mean_pred_value"].append(list(mean_predicted_value))

        # labels, probabilities and subjects all refer to the same test rows
        pat_predictions, pat_true = combine_patient_predictions(
            np.ravel(y_test), y_pred_prob, groups_test
        )
        cv_scores["validate_pat_predictions"].append(pat_predictions)
        cv_scores["validate_pat_true"].append(pat_true)

        # store output for feature importances (computed on the full dataset,
        # as before)
        if not is_keras:
            results = determine_feature_importances(
                estimator, X_formatted, y, n_jobs=n_jobs
            )
            cv_scores["validate_imp_mean"].append(list(results.importances_mean))
            cv_scores["validate_imp_std"].append(list(results.importances_std))

            print("Done analyzing feature importances...")

        # save intermediate analyses
        clf_func_path = (
            out_dir
            / "classifiers"
            / f"{clf_type}_classifiers_{feature_name}_{jdx}.npz"
        )
        nested_scores_fpath = (
            out_dir / f"study_cv_scores_{clf_type}_{feature_name}_{jdx}.json"
        )

        # save the estimators
        if clf_type not in ["srerf", "mtmorf"]:
            np.savez_compressed(clf_func_path, estimators=estimator)

        # save all the master scores as a JSON file
        with open(nested_scores_fpath, "w") as fout:
            json.dump({str(k): cv_scores[k] for k in cv_scores}, fout, cls=NumpyEncoder)

In [10]:
# Names of the individual features inside each feature group; the entry for a
# group is handed to `load_data` when that group is loaded.
feature_names = dict(
    sourcesink=["kldiv", "entropy", "variance", "skew", "kurtosis"],
    spikes=["spike_rate", "max_spikes"],
    fragility=["kldiv", "entropy", "variance", "skew", "kurtosis"],
)

In [17]:
# NOTE: these are machine-specific absolute paths; adjust them (or derive them
# from a configurable data directory) before running on another machine.
deriv_path = Path("D:/OneDriveParent/OneDrive - Johns Hopkins/Shared Documents/40Hz-30/derivatives")
excel_fpath = Path("D:/ScalpData/JHU_scalp_clinical_datasheet_raw_local.xlsx")
json_fpath = Path("D:/Desktop/ezscalp/scripts/epilepsy_inds_single.json")
study_path = deriv_path / "study"

# Load every requested feature group and merge them sample by sample.
# `y`, `subject_groups`, `subjects`, `ch_names` and `centers` are taken from
# the last group loaded; this assumes every group returns the same samples in
# the same order.
unformatted_X = []
for ind, feature_group in enumerate(include_feature_groups):
    part_X, y, subject_groups, subjects, ch_names, centers = load_data(
        feature_group,
        include_subject_groups,
        deriv_path,
        excel_fpath,
        feature_names[feature_group],
        json_fpath,
        save_cv_indices=True
    )
    if ind == 0:
        unformatted_X = part_X.copy()
        continue

    # zip() would silently truncate to the shorter input and leave X
    # misaligned with y, so fail loudly instead
    if len(part_X) != len(unformatted_X):
        raise ValueError(
            f"Feature group '{feature_group}' has {len(part_X)} samples, "
            f"expected {len(unformatted_X)}."
        )
    # NOTE(review): `x + p` concatenates when the per-sample entries are lists
    # but adds element-wise when they are NumPy arrays — confirm the type
    # returned by load_feature_data.
    unformatted_X = [x + p for x, p in zip(unformatted_X, part_X)]


In [None]:
# Display names for the merged feature columns, grouped by feature family
# (ss = sourcesink, frag = fragility).
merged_feature_names = (
    ["ss-kldiv", "ss-entropy", "ss-variance", "ss-skew", "ss-kurtosis"]
    + ["spike_rate", "max_spikes"]
    + ["frag-kldiv", "frag-entropy", "frag-variance", "frag-skew", "frag-kurtosis"]
)

In [18]:
# Run the fold-wise validation for every configured classifier; the keys of
# `names` are paired positionally with the estimators in `classifiers`.
for clf_name, clf_func in zip(names, classifiers):
    run_clf_validation(
        clf_type=clf_name,
        clf_func=clf_func,
        unformatted_X=unformatted_X,
        y=y,
        subject_groups=subject_groups,
        study_path=study_path,
    )

Updated classifier:  LogisticRegression(n_jobs=-1, random_state=0, solver='liblinear')
Done analyzing calibration stats...
Done analyzing feature importances...
Updated classifier:  LogisticRegression(n_jobs=-1, random_state=0, solver='liblinear')
Done analyzing calibration stats...
Done analyzing feature importances...
Updated classifier:  LogisticRegression(n_jobs=-1, random_state=0, solver='liblinear')
Done analyzing calibration stats...
Done analyzing feature importances...
Updated classifier:  LogisticRegression(n_jobs=-1, random_state=0, solver='liblinear')
Done analyzing calibration stats...
Done analyzing feature importances...




Updated classifier:  LogisticRegression(n_jobs=-1, random_state=0, solver='liblinear')
Done analyzing calibration stats...
Done analyzing feature importances...
Updated classifier:  LogisticRegression(n_jobs=-1, random_state=0, solver='liblinear')
Done analyzing calibration stats...
Done analyzing feature importances...
Updated classifier:  LogisticRegression(n_jobs=-1, random_state=0, solver='liblinear')
Done analyzing calibration stats...
Done analyzing feature importances...
Updated classifier:  LogisticRegression(n_jobs=-1, random_state=0, solver='liblinear')
Done analyzing calibration stats...
Done analyzing feature importances...
Updated classifier:  LogisticRegression(n_jobs=-1, random_state=0, solver='liblinear')




Done analyzing calibration stats...
Done analyzing feature importances...




In [None]:
def dummy_cv(train, test):
    """Yield a single predefined (train, test) split.

    Lets a fixed pair of index collections be passed as the ``cv`` argument of
    scikit-learn helpers that accept an iterable of splits.

    Parameters
    ----------
    train : array-like
        Indices of the training samples.
    test : array-like
        Indices of the test samples.

    Yields
    ------
    tuple
        ``(train, test)``, exactly once.
    """
    # use the arguments rather than notebook-level globals, so the helper
    # works on a fresh kernel and yields what the caller actually passed
    yield train, test

In [None]:
# Disabled scratch cell: the guard is always false, so nothing below runs
# unless it is flipped by hand. (The previous guard was not valid Python and
# prevented the notebook from being parsed.)
if False:
    jdx = 8
    with np.load(
        # study_path / "inds" / 'clinical_complexity' / f"{jdx}-inds.npz",
        study_path
        / "inds"
        / "fixed_folds_subjects"
        / f"sourcesink-srerf-{jdx}-inds.npz",
        allow_pickle=True,
    ) as data_dict:
        # train_inds, test_inds = data_dict["train_inds"], data_dict["test_inds"]
        train_pats, test_pats = data_dict["train_pats"], data_dict["test_pats"]

    # set train indices based on which subjects
    train_inds = [
        idx for idx, sub in enumerate(subject_groups) if sub in train_pats
    ]
    test_inds = [idx for idx, sub in enumerate(subject_groups) if sub in test_pats]


In [None]:
# Disabled scratch cell: the guard is always false, so nothing below runs
# unless it is flipped by hand. (The previous guard was not valid Python and
# prevented the notebook from being parsed.) It relies on `train_inds` /
# `test_inds` existing at notebook level.
if False:
    X, dropped_inds = format_supervised_dataset(
                unformatted_X
            )
    clf = LogisticRegression(random_state=0, n_jobs=ncores, solver="liblinear")
    X_train, y_train = _safe_split(clf, X, y, train_inds)
    X_test, y_test = _safe_split(clf, X, y, test_inds, train_inds)
    # fit on the training split that was just built (the name used before,
    # `X_formatted`, does not exist at notebook level)
    clf.fit(X_train, y_train)
    clf.coef_