# Validation Experiment

This is a much simplified version of previous validation notebooks. You load the data in, then there is a cell for setting the run parameters, then you run everything in a loop.

Feature creation can now come in two forms: time-averaged or PCA. Both methods are used to reduce the dimensionality of the original heatmap so that features can be generated from a reduced dataset.

The feature generation function now takes kwargs so that you can specify how the PCA is done (e.g., how many components are kept) or how the features are calculated (e.g., splitting lobe calculations by hemisphere).

In [1]:
# %load_ext nb_black

In [2]:
import numpy as np
import scipy
import scipy.io
import pandas as pd
from pathlib import Path
import os
import collections
from natsort import natsorted
import json
import pickle
import warnings
import sys
from numpy import interp
from pprint import pprint

from numpy.testing import assert_array_equal

warnings.filterwarnings("ignore")

from sklearn.preprocessing import OrdinalEncoder, LabelBinarizer, LabelEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import calibration_curve
from sklearn.metrics import (
    brier_score_loss,
    roc_curve,
    average_precision_score,
    roc_auc_score,
    f1_score,
    recall_score,
    jaccard_score,
    balanced_accuracy_score,
    accuracy_score,
    auc,
    precision_score,
    plot_precision_recall_curve,
    average_precision_score,
    precision_recall_curve,
    confusion_matrix,
    cohen_kappa_score,
    make_scorer,
    precision_recall_fscore_support,
)
from sklearn.inspection import permutation_importance
from sklearn.model_selection import (
    StratifiedGroupKFold,
    cross_validate,
    StratifiedShuffleSplit,
    LeaveOneGroupOut,
)
from sklearn.utils import resample
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import make_pipeline

import mne
from mne.time_frequency import read_tfrs

mne.set_log_level("ERROR")
from mne_bids import BIDSPath, get_entities_from_fname, get_entity_vals, read_raw_bids

from eztrack.io import read_derivative_npy

sys.path.append("../../")
# sys.path.append("..\episcalp")
from episcalp.features import spike_feature_vector, heatmap_features, heatmap_features2
from episcalp.io.read import (
    load_persyst_spikes,
    load_reject_log,
    load_derivative_heatmaps,
    map_rejectlog_to_deriv,
)
from episcalp.preprocess.montage import _standard_lobes
from episcalp.utils.utils import NumpyEncoder
from episcalp.cross_validate import exclude_subjects

# if you installed sporf via README
# from oblique_forests.sporf import ObliqueForestClassifier

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Define possible helper functions

In [3]:
def _get_exp_condition(subject, root):
    part_fname = os.path.join(root, "participants.tsv")
    df = pd.read_csv(part_fname, sep="\t")

    if not subject.startswith("sub-"):
        subject = f"sub-{subject}"

    return df[df["participant_id"] == subject]

In [4]:
def convert_experimental_cond_to_y(experimental_condition_list):
    """Translate experimental-condition names into integer class labels.

    Parameters
    ----------
    experimental_condition_list : iterable of str
        Condition names; each must be one of ``"non-epilepsy-normal-eeg"``,
        ``"epilepsy-normal-eeg"`` or ``"epilepsy-abnormal-eeg"``.

    Returns
    -------
    list of int
        One label per input condition, in input order (0, 1 and 2 for the
        three conditions listed above, respectively).

    Raises
    ------
    KeyError
        If a condition name is not one of the three known groups.
    """
    label_for_condition = {
        "non-epilepsy-normal-eeg": 0,
        "epilepsy-normal-eeg": 1,
        "epilepsy-abnormal-eeg": 2,
    }

    labels = []
    for condition in experimental_condition_list:
        labels.append(label_for_condition[condition])
    return labels

In [5]:
def _preprocess_epochs_tfr(data):
    """Turn TFR data into a 2D array."""
    assert data.ndim == 4

    # take the average over frequencies
    data = np.mean(data, axis=2)

    # move the epoch ("window") axis to last
    data = np.moveaxis(data, 0, -2)

    # compress the time axis
    data = np.mean(data, axis=-1)

    # convert to dB
    data = 20 * np.log10(data)

    data = np.reshape(data, (data.shape[0], -1))
    return data

In [6]:
def combine_datasets(deriv_dataset):
    """Concatenate several derivative datasets into one dictionary of lists.

    Parameters
    ----------
    deriv_dataset : list of dict
        Datasets to merge. Every dataset must have exactly the same keys, and
        every value must be an iterable that ``list.extend`` accepts.

    Returns
    -------
    dict
        New dictionary with the keys of the first dataset; each value is a new
        list holding the entries of all datasets, in input order. The inputs
        are not modified.

    Raises
    ------
    RuntimeError
        If any dataset has a key the first dataset lacks, or lacks a key the
        first dataset has.
    """
    dataset = deriv_dataset[0]
    expected_keys = set(dataset.keys())

    # Validate in both directions up front: a dataset that is *missing* a key
    # would otherwise only fail later with a bare KeyError while merging.
    for deriv in deriv_dataset:
        deriv_keys = set(deriv.keys())
        if deriv_keys != expected_keys:
            unexpected = sorted(deriv_keys - expected_keys, key=str)
            missing = sorted(expected_keys - deriv_keys, key=str)
            raise RuntimeError(
                f"All keys in {dataset.keys()} must match every other derived dataset. "
                f"Unexpected keys: {unexpected}, missing keys: {missing}, "
                f"got {deriv.keys()}."
            )

    # convert to a dictionary of lists
    derived_dataset = {key: [] for key in dataset.keys()}
    for deriv in deriv_dataset:
        for key, combined in derived_dataset.items():
            combined.extend(deriv[key])
    return derived_dataset

# Define Data Directories

In [7]:
# Select whose machine the notebook runs on. Each branch defines the same four
# names: jhroot, jeffroot, upmcroot and deriv_dir.
user = "kristin"
if user == "patrick":
    jhroot = Path("D:/OneDriveParent/OneDrive - Johns Hopkins/Shared Documents/bids")
    jeffroot = Path("D:/OneDriveParent/Johns Hopkins/Jefferson_Scalp - Documents/root")

    # not ready yet
    upmcroot = Path("/Users/adam2392/Johns Hopkins/UPMC_Scalp - Documents/")
    deriv_dir = Path(
        "D:/OneDriveParent/OneDrive - Johns Hopkins/Shared Documents/derivatives"
    )
elif user == "adam":
    jhroot = Path("/Users/adam2392/Johns Hopkins/Scalp EEG JHH - Documents/bids/")
    jeffroot = Path("/Users/adam2392/Johns Hopkins/Jefferson_Scalp - Documents/root/")

    # not ready yet
    upmcroot = Path("/Users/adam2392/Johns Hopkins/UPMC_Scalp - Documents/")
    deriv_dir = Path(
        "/Users/adam2392/Johns Hopkins/Scalp EEG JHH - Documents/derivatives"
    )
elif user == "kristin":
    # alternative location of the same folders, kept for reference:
    #     jhroot = Path("/Users/Kristin/OneDrive - Johns Hopkins/Documents - Scalp EEG JHH/bids/")
    #     jeffroot = Path("/Users/Kristin/OneDrive - Johns Hopkins/Documents - Jefferson_Scalp/root/")
    #     upmcroot = Path("/Users/Kristin/OneDrive - Johns Hopkins/Documents - UPMC_Scalp")
    #     deriv_dir = Path("/Users/Kristin/OneDrive - Johns Hopkins/Documents - Scalp EEG JHH/derivatives")
    jhroot = Path("D:/kgunnar1/OneDrive - Johns Hopkins/Documents - Scalp EEG JHH/bids")
    jeffroot = Path("D:/kgunnar1/OneDrive - Johns Hopkins/Documents - Jefferson_Scalp/root")

    # not ready yet
    upmcroot = Path("D:/kgunnar1/OneDrive - Johns Hopkins/Documents - UPMC_Scalp")
    deriv_dir = Path("D:/kgunnar1/OneDrive - Johns Hopkins/Documents - Scalp EEG JHH/derivatives")
else:
    # Fail here rather than with a NameError on jhroot/deriv_dir in a later cell.
    raise ValueError(
        f"Unknown user {user!r}; expected one of 'patrick', 'adam', 'kristin'."
    )

In [8]:
# Dataset roots whose derivatives are loaded below; upmcroot is defined above
# but is not included here.
bids_roots = [jhroot, jeffroot]

In [9]:
# Values interpolated into the derivative directory names built below.
reference = "monopolar"
radius = "1.25"  # string: used verbatim in the "radius{radius}" folder name

# define different derivative chains
# (relative paths; the loading cells join them onto <root>/derivatives)
ss_deriv_chain = Path("sourcesink") / "win-500" / "step-250" / reference
frag_deriv_chain = (
    Path("fragility") / f"radius{radius}" / "win-500" / "step-250" / reference
)

# one time-frequency (TFR) derivative folder per frequency band
delta_tfr_deriv_chain = Path("tfr") / "delta"
theta_tfr_deriv_chain = Path("tfr") / "theta"
alpha_tfr_deriv_chain = Path("tfr") / "alpha"
beta_tfr_deriv_chain = Path("tfr") / "beta"

In [10]:
# All derivative chains defined above, collected in one list.
# NOTE(review): not referenced by any later cell visible in this notebook.
heatmap_reprs = [
    ss_deriv_chain,
    frag_deriv_chain,
    delta_tfr_deriv_chain,
    theta_tfr_deriv_chain,
    alpha_tfr_deriv_chain,
    beta_tfr_deriv_chain,
]


# Load The Data (Once)

In [11]:
# load fragility data
# One dataset is loaded per BIDS root and the per-root datasets are then
# concatenated into `fragility_dataset`.
datasets = []
# NOTE(review): "True" is a string, not the boolean True - confirm what
# load_derivative_heatmaps / the read function expects for `preload`.
# NOTE: the name `kwargs` is reassigned in the run-parameters cell below.
kwargs = {"preload": "True"}
for root in bids_roots:
    print(f"Loading fragility for {root}")
    dataset = load_derivative_heatmaps(
        root / "derivatives" / frag_deriv_chain,
        search_str="*desc-perturbmatrix*.npy",
        read_func=read_derivative_npy,
        subjects=None,
        verbose=False,
        source_check=False,
        **kwargs
    )
    datasets.append(dataset)
fragility_dataset = combine_datasets(datasets)
# `dataset` is the loop variable, i.e. the dataset of the LAST root only;
# the second print is the size of the combined dataset.
print(len(dataset["subject"]))
print(len(fragility_dataset["subject"]))


Loading fragility for D:\kgunnar1\OneDrive - Johns Hopkins\Documents - Scalp EEG JHH\bids
Loading fragility for D:\kgunnar1\OneDrive - Johns Hopkins\Documents - Jefferson_Scalp\root
112
233


In [12]:
# Load the source-sink derivative heatmaps, pooling every BIDS root.
# Each entry maps a short name to the file pattern of one derivative stored
# under `ss_deriv_chain`; the insertion order is the load order.
ss_search_strs = {
    "ss": "*desc-ssindmatrix*.npy",
    "sourceinfl": "*desc-sourceinflmatrix*.npy",
    "sinkconn": "*desc-sinkconn*.npy",
    "sinkind": "*desc-sinkind*.npy",
}

ss_datasets = {}
for ss_name, search_str in ss_search_strs.items():
    # `datasets` / `dataset` / `root` are deliberately left as plain
    # module-level loop variables: the next cell prints `dataset`, which ends
    # up being the last pattern loaded from the last root.
    datasets = []
    for root in bids_roots:
        dataset = load_derivative_heatmaps(
            root / "derivatives" / ss_deriv_chain,
            search_str=search_str,
            read_func=read_derivative_npy,
            subjects=None,
            verbose=False,
            source_check=False,
        )
        datasets.append(dataset)
    ss_datasets[ss_name] = combine_datasets(datasets)

# expose each combined dataset under the name the rest of the notebook uses
ss_dataset = ss_datasets["ss"]
sourceinfl_dataset = ss_datasets["sourceinfl"]
sinkconn_dataset = ss_datasets["sinkconn"]
sinkind_dataset = ss_datasets["sinkind"]

In [13]:
# Sanity check on the number of entries per dataset.
# `dataset` is whatever the loading loops above assigned last (a single-root
# dataset); the remaining lines are the combined, all-root datasets.
print(len(dataset["subject"]))
print(len(ss_dataset["subject"]))
print(len(sinkind_dataset["subject"]))
print(len(sinkconn_dataset["subject"]))
print(len(sourceinfl_dataset["subject"]))

112
233
233
233
233


In [14]:
def read_tfrs_lamb(x):
    """Read the TFR file ``x`` and return element 0 of what ``read_tfrs`` returns."""
    return read_tfrs(x)[0]


# load TFR data
# Frequency band -> derivative folder of that band; the band name is also
# part of the file pattern searched for. Insertion order is the load order.
tfr_deriv_chains = {
    "delta": delta_tfr_deriv_chain,
    "theta": theta_tfr_deriv_chain,
    "alpha": alpha_tfr_deriv_chain,
    "beta": beta_tfr_deriv_chain,
}

tfr_datasets = {}
for freq_band, tfr_deriv_chain in tfr_deriv_chains.items():
    # one dataset per BIDS root, concatenated once all roots are read
    datasets = []
    for root in bids_roots:
        dataset = load_derivative_heatmaps(
            root / "derivatives" / tfr_deriv_chain,
            search_str=f"*desc-{freq_band}*.h5",
            read_func=read_tfrs_lamb,
            subjects=None,
            verbose=False,
        )
        datasets.append(dataset)
    tfr_datasets[freq_band] = combine_datasets(datasets)

# expose each combined dataset under the name the rest of the notebook uses
delta_dataset = tfr_datasets["delta"]
theta_dataset = tfr_datasets["theta"]
alpha_dataset = tfr_datasets["alpha"]
beta_dataset = tfr_datasets["beta"]

## Define Sets of Data

In [15]:
# Lookup from metric name to its combined dataset; the experiment loop indexes
# this with the entries of `metric_names`. The "ss" dataset is loaded above
# but left out here.
metric_mapping = {
    "fragility": fragility_dataset,
    #     "ss": ss_dataset,
    "sourceinfl": sourceinfl_dataset,
    "sinkind": sinkind_dataset,
    "sinkconn": sinkconn_dataset,
    "delta": delta_dataset,
    "theta": theta_dataset,
    "alpha": alpha_dataset,
    "beta": beta_dataset,
}

# Define Run parameters

In [16]:
# NOTE: `user` was already set in the data-directory cell; changing it here
# does not re-select the paths chosen there.
user = "kristin"
# Pass a list of features for average and pca summary methods
# Keys are summary methods, values the feature types to compute with that
# method; the experiment loop skips any method whose list is empty.
feature_type_dict = {
    "average": ["lobes"],
    "pca": [],
    "variance": [],
    "singular_values": []
    #     "singular_values": ["first_n"]
}
# Pass any kwargs possibly needed
# TODO: expand to allow things like only returning the mean per lobe
# These are forwarded unchanged to heatmap_features2 in the experiment loop.
# NOTE: this rebinds the `kwargs` name used when loading the fragility data.
kwargs = {
    "n_keep": 2,
    # "n_components": 2,
    "separate_hemispheres": False,
}

# List of metrics to include
# Every non-empty combination of these names is run as one experiment; each
# name must be a key of `metric_mapping`.
metric_names = [
    "fragility",
    #     "ss",
    "sinkind",
    "sinkconn",
    "sourceinfl",
    "delta",
    "theta",
    "alpha",
    "beta",
]

# NOTE(review): these four values are not read by the later cells visible
# here (the derivative paths were already built above); `radius` is a float
# here but a string in the derivative-chain cell.
winsize = 500
stepsize = 250
radius = 1.25
reference = "monopolar"

# Binary exclusion criteria - columns from the participants.tsv
# Both dictionaries are passed to exclude_subjects in the experiment loop.
# NOTE(review): presumably a list names the values to exclude and None
# disables the criterion - confirm against exclude_subjects.
categorical_exclusion_criteria = {
    "exp_condition": ["epilepsy-abnormal-eeg"],
    "final_diagnosis": None,
    "epilepsy_type": ["generalized"],
    "epilepsy_hemisphere": None,
    "epilepsy_lobe": None,
}
continuous_exclusion_criteria = {
    "age": None,
    "num_aeds": None,
}

# Cross validation parameters
n_splits = 20
train_size = 0.7
random_state = 12345

# Define Classifier parameters
# clf_name selects the model in the experiment loop ("rf", "sporf" or "lr")
# and is also part of the output path.
clf_name = "lr"
rf_model_params = {
    "n_estimators": 1000,
    "n_jobs": -1,
    "random_state": random_state,
}
lr_model_params = {
    "n_jobs": -1,
    "random_state": random_state,
    "penalty": "l1",
    "solver": "liblinear",
}


# used as the stem of the results CSV file name
exp_name = "heatmap_feats2_lobes_avg"

# Run Experiment in a Loop

In [17]:
# NOTE(review): `scaler` is not referenced by the later cells visible here;
# the experiment loop puts its own StandardScaler() into the pipeline.
scaler = StandardScaler()
# fitted on the experimental conditions inside the experiment loop
y_enc = LabelBinarizer()

In [18]:
# Cross-validation splitters, configured from the run parameters above.
stratified_cv = StratifiedShuffleSplit(
    n_splits=n_splits, train_size=train_size, random_state=random_state,
)
log_cv = LeaveOneGroupOut()
# cv = BootstrapSplit(n_splits=100, random_state=random_state)

# Splitters actually run by the experiment loop; each key becomes the prefix
# of that splitter's result columns. Leave-one-group-out is disabled.
cvs = {
    "stratifiedshuffle": stratified_cv,
    #        "leaveonesubout": log_cv
}

In [19]:
# Scorers handed to cross_validate: either scorer objects built with
# make_scorer or scikit-learn scorer names (the trailing comment on each line
# names the metric function the string refers to).
scoring_funcs = {
    "balanced_accuracy": make_scorer(balanced_accuracy_score),
    "cohen_kappa_score": make_scorer(cohen_kappa_score),
    "roc_auc": "roc_auc",  # roc_auc_score,
    "f1": "f1",  # f1_score,
    "recall": "recall",  # recall_score,
    "precision": "precision",  # precision_score,
    "jaccard": "jaccard",  # jaccard_score,
    "average_precision": "average_precision",  # average_precision_score,
    "neg_brier_score": "neg_brier_score",  # brier_score_loss,
}

# `scoring` keeps pointing at this dictionary even though the experiment loop
# later rebinds the name `scoring_funcs`.
scoring = scoring_funcs
print(scoring)

{'balanced_accuracy': make_scorer(balanced_accuracy_score), 'cohen_kappa_score': make_scorer(cohen_kappa_score), 'roc_auc': 'roc_auc', 'f1': 'f1', 'recall': 'recall', 'precision': 'precision', 'jaccard': 'jaccard', 'average_precision': 'average_precision', 'neg_brier_score': 'neg_brier_score'}


In [20]:
# Results CSV: <deriv_dir>/normaleeg/<clf_name>/<exp_name>_features.csv
fname = deriv_dir / "normaleeg" / clf_name / f"{exp_name}_features.csv"
# create the output folder (and any missing parents) ahead of the run
fname.parent.mkdir(exist_ok=True, parents=True)

print(f"File {fname} exists {fname.exists()}")

File D:\kgunnar1\OneDrive - Johns Hopkins\Documents - Scalp EEG JHH\derivatives\normaleeg\lr\heatmap_feats2_lobes_avg_features.csv exists False


In [21]:
import itertools

# Run one experiment per non-empty combination of `metric_names`:
#   1. build the feature matrix from the selected heatmaps,
#   2. drop excluded subjects,
#   3. fit the classifier on all remaining data and score it (train_* columns),
#   4. cross-validate with every splitter in `cvs` (<cv name>_test_* columns),
# then collect one result row per experiment and write them all to `fname`.

# ---- class labels: identical for every experiment, so built only once ----
# The fragility dataset serves as the reference for subjects and roots.
# NOTE(review): rows are matched across metrics purely by position, i.e. all
# datasets in `metric_mapping` are assumed to list recordings in the same
# order as the fragility dataset - confirm.
subjects = np.array(fragility_dataset["subject"])
roots = fragility_dataset["roots"]
n_recordings = len(subjects)

# get the experimental conditions
exp_conditions = []
for subject, root in zip(subjects, roots):
    subj_df = _get_exp_condition(subject, root)
    exp_conditions.append(subj_df["exp_condition"].values[0])

# The encoder is fitted only to record the classes present before exclusion;
# the labels the model sees come from convert_experimental_cond_to_y.
y_enc.fit(exp_conditions)
y_all = np.array(convert_experimental_cond_to_y(exp_conditions))

# ---- scorers for the fit-on-everything ("train") scores ----
# Reuse the cross-validation scorers so train_* and *_test_* columns measure
# the same quantity (probability/threshold based where the metric needs it,
# negated Brier score), and add specificity (= recall of class 0).
train_scorers = {
    score_name: sklearn.metrics.get_scorer(scorer)
    if isinstance(scorer, str)
    else scorer
    for score_name, scorer in scoring.items()
}
train_scorers["specificity"] = make_scorer(recall_score, pos_label=0)

metric_combinations = itertools.chain.from_iterable(
    itertools.combinations(metric_names, n_metrics)
    for n_metrics in range(1, len(metric_names) + 1)
)

result_rows = []
for exp_idx, names in enumerate(metric_combinations):
    print(f"Using metrics: {names}")

    # create feature matrix: one row per recording, the features of every
    # selected metric concatenated in the order given by `names`
    features = []
    for rec_idx in range(n_recordings):
        feature_vec = []
        for name in names:
            metric_dataset = metric_mapping[name]
            # extract data and form feature vector
            data = metric_dataset["data"][rec_idx]
            ch_names = metric_dataset["ch_names"][rec_idx]
            for summary_method, feature_types in feature_type_dict.items():
                if not feature_types:
                    continue
                _feature_vec = heatmap_features2(
                    data,
                    ch_names=ch_names,
                    types=feature_types,
                    summary_method=summary_method,
                    **kwargs,
                )
                if _feature_vec:
                    feature_vec.extend(_feature_vec)
        features.append(feature_vec)

    # Further subset the subjects if desired
    # (copies are passed so the shared label/subject arrays stay untouched)
    X, y, keep_subjects = exclude_subjects(
        np.array(features),
        y_all.copy(),
        subjects.copy(),
        bids_roots,
        categorical_exclusion_criteria,
        continuous_exclusion_criteria,
    )

    if clf_name in ("rf", "sporf"):
        # Default max_features to the width of THIS experiment's matrix. Work
        # on a copy so the value chosen for one experiment is not silently
        # reused by later ones with a different number of features.
        forest_params = dict(rf_model_params)
        if not forest_params.get("max_features"):
            forest_params["max_features"] = X.shape[1]
        if clf_name == "rf":
            estimator = RandomForestClassifier(**forest_params)
        else:
            # only used if you installed cysporf (the ObliqueForestClassifier
            # import in the imports cell must be uncommented)
            estimator = ObliqueForestClassifier(**forest_params)
    elif clf_name == "lr":
        estimator = LogisticRegression(**lr_model_params)
    else:
        raise ValueError(
            f"Unknown clf_name {clf_name!r}; expected 'rf', 'sporf' or 'lr'."
        )

    # for multiclass
    # estimator = OneVsRestClassifier(estimator)

    # logistic regression is fitted on standardized features
    steps = [StandardScaler()] if clf_name == "lr" else []
    steps.append(estimator)
    clf = make_pipeline(*steps)

    # fit on entire dataset
    clf.fit(X, y)

    # evaluate the model performance on the data it was fitted on
    train_scores = {
        score_name: scorer(clf, X, y) for score_name, scorer in train_scorers.items()
    }

    for label in np.unique(y):
        print(f"Class {label} has ", np.count_nonzero(y == label))

    cv_scores = {}
    for cv_name, cv in cvs.items():
        # run cross-validation; the fitted estimators are not needed
        # afterwards, so they are not returned
        cv_scores[cv_name] = cross_validate(
            clf,
            X,
            y,
            groups=keep_subjects,
            cv=cv,
            scoring=scoring,
            return_estimator=False,
            return_train_score=False,
            n_jobs=-1,
            error_score="raise",
        )

    # one flat record per experiment
    row = {
        "exp": exp_idx,
        "heatmaps": str(names),
        "data_shape": str(X.shape),
        "n_splits": n_splits,
        # NOTE(review): counts the classes present BEFORE subject exclusion
        "n_classes": len(y_enc.classes_),
        "clf": clf_name,
    }
    for score_name, score in train_scores.items():
        row[f"train_{score_name}"] = score

    for cv_name, scores in cv_scores.items():
        for metric, score in scores.items():
            # keep only the test scores (skips fit_time / score_time)
            if not metric.startswith("test_"):
                continue

            row[f"{cv_name}_{metric}"] = score  # per-split scores
            row[f"{cv_name}_{metric}_avg"] = np.mean(score)
            row[f"{cv_name}_{metric}_std"] = np.std(score)
    result_rows.append(row)

result_df = pd.DataFrame(result_rows)
display(result_df)
result_df.to_csv(fname, index=None)

Using metrics: ('fragility',)
Class 0 has  80
Class 1 has  53
Using metrics: ('sinkind',)
Class 0 has  80
Class 1 has  53
Using metrics: ('sinkconn',)


KeyboardInterrupt: 

In [None]:
# Quick look at the label vector left over from the last experiment that the
# loop above reached.
print(y.shape)
print(y)
print("done")

# Plotting Results

In [None]:
# Reload the results written by the experiment loop so the plots below can be
# produced without re-running it.
clf_name = "lr"
# exp_name = "singularvalue-first2"
fname = deriv_dir / "normaleeg" / clf_name / f"{exp_name}_features.csv"
print(fname)

result_df = pd.read_csv(fname, index_col=None)

In [None]:
# (rows, columns) of the reloaded results table - one row per experiment
print(result_df.shape)
# TODO: melt the columns

In [None]:
# preview the first rows of the results table
display(result_df.head())

In [None]:
# ROC AUC of the model fitted on all data, one marker per experiment (row of
# result_df), in file order.
score_col = "train_roc_auc"
exp_indices = np.arange(len(result_df))

fig, ax = plt.subplots()
ax.plot(exp_indices, result_df[score_col], "*")
ax.set(title=f"{clf_name} ", xlabel="Exp indices", ylabel=score_col)
# chance level for ROC AUC; axhline expects a scalar y position
ax.axhline(0.5, ls="--");

In [None]:
# Mean cross-validated (stratified shuffle split) test ROC AUC, one marker per
# experiment (row of result_df), in file order.
score_col = "stratifiedshuffle_test_roc_auc_avg"
exp_indices = np.arange(len(result_df))

fig, ax = plt.subplots()
ax.plot(exp_indices, result_df[score_col], "*")
ax.set(title=f"{clf_name} ", xlabel="Exp indices", ylabel=score_col)
# chance level for ROC AUC; axhline expects a scalar y position
ax.axhline(0.5, ls="--");

# Get feature Names of top Performers

In [None]:
# Ten experiments with the highest training ROC AUC.
# NOTE: this rebinds `y` from the label vector to a column name.
y = "train_roc_auc"
# argsort is ascending, so the reversed order lists the best scores first
ordered_index = np.argsort(result_df[y])[::-1]
keep = ordered_index[:10]
# NOTE(review): the sorted positions are used to index result_df by label;
# this presumably relies on the default 0..n-1 index produced by read_csv.
names = result_df["heatmaps"][keep].values

print(result_df[y][ordered_index[:10]])
print(f"Best names: {names}")

In [None]:
# Ten experiments with the highest mean cross-validated test ROC AUC.
# NOTE: this rebinds `y` from the label vector to a column name.
y = "stratifiedshuffle_test_roc_auc_avg"
# argsort is ascending, so the reversed order lists the best scores first
ordered_index = np.argsort(result_df[y])[::-1]
# NOTE(review): the sorted positions are used to index result_df by label;
# this presumably relies on the default 0..n-1 index produced by read_csv.
names = result_df["heatmaps"][ordered_index[:10]].values

print(result_df[y][ordered_index[:10]])
# pprint (rather than print, as in the previous cell) is given the formatted
# string itself
pprint(f"Best names: {names}")