# Run Classification Experiment of Feature Heatmaps - Epilepsy Scalp EEG

Here, we run through a pipeline for performing the classification experiment on feature heatmaps.

This relies on a few files being up-to-date:

- participants.tsv: contains metadata per subject

In [2]:
%load_ext lab_black

In [32]:
import collections
import json
import os
from itertools import product
from pathlib import Path
import sys
import pandas as pd
import numpy as np
from mne_bids import get_entities_from_fname, get_entity_vals

from natsort import natsorted

# from rerf.rerfClassifier import rerfClassifier

# comparative classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
    f1_score,
    roc_curve,
    balanced_accuracy_score,
    accuracy_score,
    auc,
    brier_score_loss,
    plot_precision_recall_curve,
    average_precision_score,
    precision_recall_curve,
)
from sklearn.model_selection import (
    GroupKFold,
    cross_validate,
    StratifiedKFold,
    RepeatedStratifiedKFold,
    StratifiedShuffleSplit,
)
from sklearn.utils import resample
from sklearn.calibration import calibration_curve

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

sys.path.append("../")
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Specify where to find data

In [24]:
# Root directory of the dataset. Set the SCALP_EEG_DATA_DIR environment
# variable to point somewhere else; otherwise the original local path is used.
data_path = Path(
    os.environ.get(
        "SCALP_EEG_DATA_DIR",
        "/Users/adam2392/Johns Hopkins/Scalp EEG JHH - Documents/",
    )
)

# dataset root and the derivatives folder inside it
bids_path = data_path / "40Hz-30"
deriv_root = bids_path / "derivatives"

# analysis parameters
reference = "monopolar"

# type of derived data (sub-path below `deriv_root`)
deriv_chain = Path("fragility") / "radius1.25" / reference

# where to read subject-specific metadata
participants_fname = bids_path / "participants.tsv"

In [26]:
# Collect the subject identifiers that have derivatives under the chosen
# derivative chain, then report how many were found.
subjs = get_entity_vals(
    Path(deriv_root, deriv_chain).as_posix(),
    entity_key="subject",
)

print(f"Analyzed {len(subjs)} subjects")

Analyzed 22 subjects


In [35]:
# Collect all derived datasets.
# Here, we find them by using the extension `.npy`. Each unique `.npy` file
# defines another snapshot.
# Maybe change according to your dataset.
#
# `rglob` returns a single-use generator in filesystem order. It is
# materialised into a sorted list so that downstream cells can iterate it more
# than once (e.g. when a cell is re-run) and always see the same order.
deriv_fpaths = sorted((deriv_root / deriv_chain).rglob("*desc-perturbmatrix_eeg.npy"))

# Read in Clinical Metadata

In [27]:
# participants.tsv is tab-separated: one row of metadata per subject
part_df = pd.read_csv(participants_fname, sep="\t")

Unnamed: 0,participant_id,age,sex,hand
0,sub-001,,,
1,sub-002,,,
2,sub-003,,,
3,sub-004,,,
4,sub-005,,,


# Instantiate Classification Model

In [None]:
# NOTE: `rerfClassifier` must be importable for this cell to run; its import
# (`from rerf.rerfClassifier import rerfClassifier`) is currently commented
# out in the imports cell.

# parallelism and reproducibility
n_jobs = -1
num_runs = 1
# Defined here so this cell runs on a fresh kernel; same value as the seed
# used when building the cross-validation splitter.
random_state = 12345

# forest hyper-parameters
n_est = 500  # number of estimators
max_depth = None
max_features = "auto"

# image dimensions handed to the classifier
IMAGE_HEIGHT = 20
IMAGE_WIDTH = 1

model_params = {
    "n_estimators": n_est,
    # scalars are passed as-is: indexing `None` raises a TypeError and
    # indexing the string "auto" would silently yield "a"
    "max_depth": max_depth,
    "max_features": max_features,
    "n_jobs": n_jobs,
    "random_state": random_state,
    #     "projection_matrix": "RerF",
    #     "projection_matrix": "S-RerF",
    "projection_matrix": "MT-MORF",
    "image_height": IMAGE_HEIGHT,
    # NOTE(review): previously derived from an undefined `windows` variable;
    # IMAGE_WIDTH is assumed to be the intended value - confirm.
    "image_width": IMAGE_WIDTH,
    "patch_height_max": 4,
    "patch_height_min": 1,
    "patch_width_max": 8,
    "patch_width_min": 1,
}

clf = rerfClassifier(**model_params)

# Instantiate Scoring Functions

In [36]:
# perform CV using Sklearn
# and keep track of these metrics
scoring_funcs = {
    "roc_auc": roc_auc_score,
    "accuracy": accuracy_score,
    "balanced_accuracy": balanced_accuracy_score,
    "average_precision": average_precision_score,
    "brier_score": brier_score_loss,
}

# Run Classification Experiment

In [33]:
# set seed and randomness for downstream reproducibility
seed = 12345
random_state = 12345
np.random.seed(seed)
n_jobs = -1

# proportion of subjects to use for training
train_size = 0.6

# format supervised learning datasets
# # define preprocessing to convert labels/groups into numbers
# enc = OrdinalEncoder()  # handle_unknown='ignore', sparse=False
# #     subject_groups = enc.fit_transform(np.array(subjects)[:, np.newaxis])
# y = enc.fit_transform(np.array(y)[:, np.newaxis])
# subject_groups = np.array(subject_groups)

# get a stratified K fold
cv = StratifiedShuffleSplit(
    n_splits=10, train_size=train_size, random_state=random_state
)

In [None]:
# load in X and y
for fpath in deriv_fpaths:
    # get entities from fname
    entities = get_entities_from_fname(fpath.name)
    subject = entities['subject']
    
    # get the label for this subject
    
    # transform the dataset into necessary parameter

In [None]:
# Iterate over the train/test index pairs produced by the splitter.
# NOTE(review): uses `cv`, the only splitter this notebook defines; no `gss`
# object exists. Confirm whether a group-aware splitter was intended.
for jdx, (train_inds, test_inds) in enumerate(cv.split(X, y)):
    # TODO: fit and evaluate the classifier on this fold
    print(f"Fold {jdx}: {len(train_inds)} train / {len(test_inds)} test samples")

In [None]:
# Translate the metric names into scorer strings that scikit-learn recognises.
# "brier_score" is not a registered scorer; the Brier loss is exposed as
# "neg_brier_score" (negated so that greater is better).
scoring_names = [
    "neg_brier_score" if name == "brier_score" else name for name in scoring_funcs
]

# Cross-validate the classifier, recording train and test scores per split.
# NOTE(review): `groups` is only honoured by group-aware splitters. If `cv` is
# a StratifiedShuffleSplit it is ignored, so samples from one subject can end
# up in both the train and the test set.
scores = cross_validate(
    clf,
    X_formatted,
    y,
    groups=subject_groups,
    cv=cv,
    scoring=scoring_names,
    return_estimator=False,
    return_train_score=True,
    n_jobs=n_jobs,
)