In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import scipy as sp
import scipy.stats
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools
import re
# Regular-expression pattern matching a single ASCII decimal digit.
res_digit = r'[0-9]'

# fourier transform
from scipy.fft import fft, ifft

from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE



In [2]:
# This is a hack to make the library in the parent folder available for imports
# (the `predicament` package is imported by later cells).
# A better solution is by np8 here:
# https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import sys
import os
import inspect

# NOTE(review): assumes sys.path[0] is the directory containing this notebook,
# which is how the kernel was started for the recorded output -- confirm if the
# notebook is launched differently.
thisdir = sys.path[0]
print(f"thisdir = {thisdir}")
parentdir = os.path.dirname(thisdir)
if parentdir not in sys.path:
    print("Adding parent directory to python path")
    # Insert at index 1 so the notebook directory stays first on the path.
    sys.path.insert(1, parentdir)
else:
    print("Skipping adding parent direct to path (there already)")

print(f"sys.path =\n{sys.path}")



thisdir = /home/luke/git/external/predicament/notebooks
Adding parent directory to python path
sys.path =
['/home/luke/git/external/predicament/notebooks', '/home/luke/git/external/predicament', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/luke/.local/lib/python3.10/site-packages', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/lib/python3.10/dist-packages']


In [3]:
## ensure relative path to data directory is sound
# For the notebook we point the PREDICAMENT_DATA_DIR environment variable at
# the `data` folder one level above the current working directory.
# NOTE(review): the variable is set before importing predicament.utils.config,
# presumably because that module reads it at import time when building
# FEATURED_BASE_PATH -- confirm, and keep this ordering.
import os 
os.environ['PREDICAMENT_DATA_DIR'] =  '../data'

from predicament.utils.config import FEATURED_BASE_PATH

In [4]:
from predicament.utils.file_utils import load_dataframe_and_config

from predicament.evaluation.balancing import get_group_label_counts
from predicament.evaluation.balancing import balance_data
from predicament.evaluation.grouping import get_group_assignments
from predicament.evaluation.staging import get_design_matrix
from predicament.evaluation.results import output_model_best_from_results
from predicament.evaluation.results import save_results_df_to_file


## Load features

Before running this notebook, you will need to generate the featured data (see the README file for details). Then set the variable `subdir` below to the name of the subdirectory that contains the featured data.

In [5]:
# Name of the sub-directory (under FEATURED_BASE_PATH) holding the featured data.
subdir = '20231206193533'
featured_data_dir = os.path.join(FEATURED_BASE_PATH, subdir)

featured_df, featured_config = load_dataframe_and_config(featured_data_dir, 'featured.csv')

# Pull the recording/windowing settings out of the accompanying config.
load_section = featured_config['LOAD']
n_channels = int(load_section['n_channels'])
# List-valued entries are parsed as JSON after swapping single quotes for
# double quotes (presumably they are stored as Python-style reprs -- verify).
channels, participant_list = (
    json.loads(load_section[key].replace("'", '"'))
    for key in ('channels', 'participant_list')
)
Fs = int(load_section['sample_rate'])
window_size = int(featured_config['WINDOWED']['window_size'])
# Window duration: number of samples divided by the sample rate.
time = window_size / Fs
print(f"Fs: {Fs}, n_samples = {window_size}, time: {time}s, n_channels: {n_channels}")


Fs: 64, n_samples = 256, time: 4.0s, n_channels: 7


In [6]:
# Inspect the column labels of the loaded featured data.
featured_df.columns

Index(['participant', 'condition', 'start time', 'Mean0', 'Mean1', 'Mean2',
       'Mean3', 'Mean4', 'Mean5', 'Mean6',
       ...
       'FreqKurtosis4', 'FreqKurtosis5', 'FreqKurtosis6',
       'LempelZivEntropy[b=1][0]', 'LempelZivEntropy[b=1][1]',
       'LempelZivEntropy[b=1][2]', 'LempelZivEntropy[b=1][3]',
       'LempelZivEntropy[b=1][4]', 'LempelZivEntropy[b=1][5]',
       'LempelZivEntropy[b=1][6]'],
      dtype='object', length=157)

In [7]:
# Flag controlling whether the featured data is balanced before the search;
# it is also recorded alongside the search results.
is_balanced = True
if is_balanced:
    group_col, label_col = 'participant', 'condition'

    # counts per participant/condition before balancing
    subject_condition_counts = get_group_label_counts(featured_df, group_col, label_col)
    print(f"before balancing: subject_condition_counts = {subject_condition_counts}")

    # NOTE: this rebinds featured_df, so re-running the cell balances data
    # that has already been balanced.
    featured_df = balance_data(featured_df, group_col=group_col, label_col=label_col)

    # counts per participant/condition after balancing
    subject_condition_counts = get_group_label_counts(featured_df, group_col, label_col)
    print(f"after balancing: subject_condition_counts = {subject_condition_counts}")

before balancing: subject_condition_counts = [[ 833.  473.  473.  473.  473.]
 [ 833.  113.  353.  473.  353.]
 [ 833.  233.  473.  473.  473.]
 [ 833.  593.  353.  233.  473.]
 [   0.  113.    0. 1073.  353.]
 [ 853.  433.    0.  517.  373.]
 [ 833.  593.  473.  353.   89.]
 [ 833.  593.    0.  353.  353.]
 [ 823.  333.    0.  565.  357.]
 [ 953.  473.  353.  473.  353.]
 [ 713.  593.  473.  353.  473.]]
after balancing: subject_condition_counts = [[296. 312. 473. 272. 287.]
 [296. 113. 353. 272. 287.]
 [296. 233. 473. 272. 287.]
 [296. 312. 353. 233. 287.]
 [  0. 113.   0. 272. 287.]
 [296. 312.   0. 272. 287.]
 [296. 312. 473. 272.  89.]
 [296. 312.   0. 272. 287.]
 [296. 312.   0. 272. 287.]
 [296. 312. 353. 272. 287.]
 [296. 312. 473. 272. 287.]]


## Prepare Parameter Search Experiment

In [8]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer


In [9]:
# Model name, used when naming the saved results file.
name = 'RandomForest'

# Extract the input data (design matrix) together with the feature types and
# feature names it was built from.
feature_types, feature_names, designmtx = get_design_matrix(featured_df)
# Extract the labels as integers.
condition_data = featured_df['condition'].values.astype(int)

# Prepare hold-one-group-out cross validation.
# The held-out column name is returned by get_group_assignments; a previous
# hard-coded `held_out = 'participant'` was immediately overwritten by this
# call and has been removed.
# NOTE(review): if the held-out column is meant to be configurable, it needs
# to be passed to get_group_assignments -- confirm against its signature.
held_out, groups, group_assignments = get_group_assignments(featured_df)
n_groups = len(groups)
# One cross-validation split per group.
group_kfold = GroupKFold(n_splits=n_groups)

# Choose which search to perform: 'random_search' or 'bayesian_optimization'.
search_type = 'bayesian_optimization'
# Number of iterations for the search.
n_iter = 50

feature_type_pairs = [('participant', 'participant'), ('condition', 'condition'), ('start time', 'start'), ('Mean0', 'Mean'), ('Mean1', 'Mean'), ('Mean2', 'Mean'), ('Mean3', 'Mean'), ('Mean4', 'Mean'), ('Mean5', 'Mean'), ('Mean6', 'Mean'), ('SD0', 'SD'), ('SD1', 'SD'), ('SD2', 'SD'), ('SD3', 'SD'), ('SD4', 'SD'), ('SD5', 'SD'), ('SD6', 'SD'), ('MAD0', 'MAD'), ('MAD1', 'MAD'), ('MAD2', 'MAD'), ('MAD3', 'MAD'), ('MAD4', 'MAD'), ('MAD5', 'MAD'), ('MAD6', 'MAD'), ('Max0', 'Max'), ('Max1', 'Max'), ('Max2', 'Max'), ('Max3', 'Max'), ('Max4', 'Max'), ('Max5', 'Max'), ('Max6', 'Max'), ('Min0', 'Min'), ('Min1', 'Min'), ('Min2', 'Min'), ('Min3', 'Min'), ('Min4', 'Min'), ('Min5', 'Min'), ('Min6', 'Min'), ('Energy0', 'Energy'), ('Energy1', 'Energy'), ('Energy2', 'Energy'), ('Energy3', 'Energy'), ('Energy4', 'Energy'), ('Energy5', 'Energy'), ('Energy6', 'Energy'), ('IQR0', 'IQR'), ('IQR1', 'IQR'), ('IQR2', 'IQR'), ('IQR3', 'IQR'), ('IQR4', 'IQR'), ('IQR5', 'IQR'), ('IQR6', 'IQR'), ('Correlation0', '

## Random Search

If `search_type == 'random_search'`, we use a random grid to search for best hyperparameters


In [10]:
if search_type == 'random_search':
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start=10, stop=200, num=10)]
    # Number of features to consider at every split.
    # 'auto' is rejected by recent scikit-learn releases (for classifiers it
    # was an alias of 'sqrt'), so use the same two options as the Bayesian
    # search.
    max_features = ['sqrt', 'log2']
    # Maximum number of levels in tree (None means no depth limit)
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap}

    # First create the base model to tune
    estimator = RandomForestClassifier()
    # Random search of parameters: sample n_iter combinations from the grid,
    # score each one with the grouped cross-validation splitter (group_kfold),
    # and use all available cores.
    param_search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=random_grid,
        n_iter=n_iter,
        cv=group_kfold,
        verbose=2,
        random_state=42,
        n_jobs=-1)

    # Fit the random search model
    _ = param_search.fit(X=designmtx, y=condition_data, groups=group_assignments)


## Bayesian Optimisation

If `search_type == 'bayesian_optimization'`, we use Bayesian optimisation to search for the best hyperparameters. This should typically be quicker and more effective than random search.

In [12]:
# Bayesian optimisation of the random forest hyperparameters

if search_type == 'bayesian_optimization':

    # A naive initial guess at ranges that will work. With a log-uniform
    # prior the search varies x and uses p = exp(x), which helps to focus
    # on smaller values.
    search_spaces = {
        # Number of trees in random forest
        'n_estimators': Integer(10, 2000, prior='log-uniform'),
        # Number of features to consider at every split
        'max_features': Categorical(['log2', 'sqrt']),
        # Maximum number of levels in tree
        'max_depth': Integer(1, 200, prior='log-uniform'),
        # Minimum number of samples required at each leaf node
        # (given as a float, i.e. a fraction of the training samples)
        'min_samples_leaf': Real(1e-4, 1e-2, prior='log-uniform'),
        # Method of selecting samples for training each tree
        'bootstrap': Categorical([True, False]),
    }

    # Base model to tune
    estimator = RandomForestClassifier()
    param_search = BayesSearchCV(
        estimator=estimator,
        search_spaces=search_spaces,
        n_iter=n_iter,
        cv=group_kfold,
        n_jobs=-1,
        verbose=2,
        random_state=42,
    )

    # executes bayesian optimization
    _ = param_search.fit(X=designmtx, y=condition_data, groups=group_assignments)


Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
[CV] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=0.0004281531928076346, n_estimators=348; total time= 1.8min
[CV] END b

Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
[CV] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=0.0004281531928076346, n_estimators=348; total time= 1.8min
[CV] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=0.0004281531928076346, n_estimators=348; total time= 1.9min
[CV] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=0.0004281531928076346, n_estimators=348; total time= 1.7min
[CV] END bootstrap=True, max_depth=108, max_features=log2, min_samples_leaf=0.007988179462781242, n_estimators=974; total time= 1.6min
[CV] END bootstrap=True, max_depth=108, max_features=log2, min_samples_leaf=0.0079881794627

Fitting 11 folds for each of 1 candidates, totalling 11 fits
[CV] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=0.0004281531928076346, n_estimators=348; total time= 1.8min
[CV] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=0.0004281531928076346, n_estimators=348; total time= 1.9min
[CV] END bootstrap=False, max_depth=47, max_features=sqrt, min_samples_leaf=0.0004281531928076346, n_estimators=348; total time= 1.8min
[CV] END bootstrap=True, max_depth=108, max_features=log2, min_samples_leaf=0.007988179462781242, n_estimators=974; total time= 1.6min
[CV] END bootstrap=True, max_depth=108, max_features=log2, min_samples_leaf=0.007988179462781242, n_estimators=974; total time= 1.6min
[CV] END bootstrap=True, max_depth=108, max_features=log2, min_samples_leaf=0.007988179462781242, n_estimators=974; total time= 1.4min
[CV] END bootstrap=False, max_depth=130, max_features=log2, min_samples_leaf=0.0007356404539935663, n_estimators=27; total tim

Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=10; total time=   2.2s
[CV] END bootstrap=True, max_depth=200, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=10; total time=   2.7s
[CV] END bootstrap=False, max_depth=37, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=1699; total time=10.0min
[CV] END bootstrap=False, ma

Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
[CV] END bootstrap=False, max_depth=200, max_features=sqrt, min_samples_leaf=0.0023221124149211104, n_estimators=1954; total time= 9.9min
[CV] END bootstrap=False, max_depth=200, max_features=sqrt, min_samples_leaf=0.0023221124149211104, n_estimators=1954; total time= 9.7min
[CV] END bootstrap=False, max_depth=200, max_features=sqrt, min_samples_leaf=0.0023221124149211104, n_estimators=1954; total time= 8.0min
[CV] END bootstrap=False, max_depth=10, max_features=log2, min_samples_leaf=0.009828231894078289, n_estimators=1565; total time= 3.5min
[CV] END bootstrap=False, max_depth=10, max_features=log2, min_samples_leaf=0.009828231894078289, n_estimators=1565; total time= 3.7min
[CV] END bootstrap=False, max_depth=10, max_features=log2, min_samples_leaf=0.009828231894078289, n_estimators=1565; total time= 3.2min
[CV] END bootstrap=False, max_depth=10, max_features=sqr

Fitting 11 folds for each of 1 candidates, totalling 11 fits
[CV] END bootstrap=False, max_depth=10, max_features=log2, min_samples_leaf=0.009828231894078289, n_estimators=1565; total time= 3.5min
[CV] END bootstrap=False, max_depth=10, max_features=log2, min_samples_leaf=0.009828231894078289, n_estimators=1565; total time= 3.6min
[CV] END bootstrap=False, max_depth=10, max_features=log2, min_samples_leaf=0.009828231894078289, n_estimators=1565; total time= 3.3min
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=0.005842563302866546, n_estimators=1640; total time= 6.7min
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=0.005842563302866546, n_estimators=1640; total time= 6.8min
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=0.005842563302866546, n_estimators=1640; total time= 5.8min
[CV] END bootstrap=True, max_depth=42, max_features=sqrt, min_samples_leaf=0.00010136608061102067, n_estimators=1450; total

Fitting 11 folds for each of 1 candidates, totalling 11 fits
[CV] END bootstrap=True, max_depth=37, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 6.6min
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 7.2min
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 7.5min
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits
Fitting 11 folds for each of 1 candidates, totalling 11 fits


## Saving and outputting results

In [3]:
import pandas as pd

# Fail with an actionable message when `param_search` is not a fitted search
# object in this kernel session (e.g. cells were run out of order or the name
# was rebound); previously this surfaced as an opaque AttributeError.
if not hasattr(param_search, 'cv_results_'):
    raise AttributeError(
        "param_search has no 'cv_results_' "
        f"(got an object of type {type(param_search).__name__}); "
        "run the parameter search cell before this one")

# One row per evaluated hyperparameter setting ...
result_df = pd.DataFrame(param_search.cv_results_)
# ... prefixed with columns describing this experiment.
result_df.insert(0, 'model', str(estimator))
result_df.insert(1, 'held out', held_out)
result_df.insert(2, 'balanced', is_balanced)
result_df.insert(3, 'n_splits', param_search.cv.get_n_splits())
# NOTE(review): feature_types is a set, so the order of names inside this
# string is not guaranteed to be stable between kernel sessions.
result_df.insert(4, 'feature set', str(feature_types))
display(result_df)
save_results_df_to_file(result_df, f'{search_type}_{name}')

AttributeError: 'str' object has no attribute 'cv_results_'

In [14]:
# Report the best test score and best parameters from the results table
# (the return value is discarded).
_ = output_model_best_from_results(result_df)

RandomForestClassifier(): max_test_score= 0.2650423567735877, max_std_test_score= 0.14003557522134547
best params: OrderedDict([('bootstrap', True), ('max_depth', 31), ('max_features', 'sqrt'), ('min_samples_leaf', 0.00032828990339886245), ('n_estimators', 15)])



In [15]:
# Inspect the feature types returned by get_design_matrix.
feature_types

{'Correlation',
 'Energy',
 'FreqKurtosis',
 'FreqSkewness',
 'Hurst',
 'IQR',
 'LempelZivEntropy',
 'LyapunovExponent',
 'MAD',
 'Max',
 'MaxFreqInd',
 'Mean',
 'MeanFreq',
 'Min',
 'SD',
 'arCoeff'}

[CV] END bootstrap=False, max_depth=1, max_features=log2, min_samples_leaf=0.00010402038686341951, n_estimators=1932; total time=  46.1s
[CV] END bootstrap=False, max_depth=1, max_features=log2, min_samples_leaf=0.00010402038686341951, n_estimators=1932; total time=  41.7s
[CV] END bootstrap=True, max_depth=37, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 7.7min
[CV] END bootstrap=True, max_depth=37, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 8.0min
[CV] END bootstrap=True, max_depth=37, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 6.2min
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 7.0min
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 7.3min
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 6.2min


[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 7.1min
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 7.0min
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=0.0001, n_estimators=2000; total time= 6.5min
[CV] END bootstrap=False, max_depth=195, max_features=log2, min_samples_leaf=0.00010055974895170814, n_estimators=111; total time=  22.2s
[CV] END bootstrap=False, max_depth=195, max_features=log2, min_samples_leaf=0.00010055974895170814, n_estimators=111; total time=  22.8s
[CV] END bootstrap=False, max_depth=195, max_features=log2, min_samples_leaf=0.00010055974895170814, n_estimators=111; total time=  21.9s
[CV] END bootstrap=False, max_depth=184, max_features=sqrt, min_samples_leaf=0.009739721622459256, n_estimators=23; total time=   5.5s
[CV] END bootstrap=False, max_depth=184, max_features=sqrt, min_samples_leaf=0.009739721

In [17]:
# Same feature types as a list; the underlying set is unordered, so the
# ordering shown here is arbitrary.
list(feature_types)

['MeanFreq',
 'SD',
 'Min',
 'MAD',
 'LempelZivEntropy',
 'MaxFreqInd',
 'Energy',
 'Mean',
 'Max',
 'Correlation',
 'LyapunovExponent',
 'FreqSkewness',
 'FreqKurtosis',
 'Hurst',
 'IQR',
 'arCoeff']

In [18]:
# Display the search object (scratch inspection cell).
param_search


In [21]:
# Inspect the full configuration of the search object, including the nested
# estimator parameters.
param_search.get_params()

{'cv': GroupKFold(n_splits=11),
 'error_score': 'raise',
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__class_weight': None,
 'estimator__criterion': 'gini',
 'estimator__max_depth': None,
 'estimator__max_features': 'sqrt',
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__monotonic_cst': None,
 'estimator__n_estimators': 100,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestClassifier(),
 'fit_params': None,
 'iid': 'deprecated',
 'n_iter': 50,
 'n_jobs': -1,
 'n_points': 1,
 'optimizer_kwargs': None,
 'pre_dispatch': '2*n_jobs',
 'random_state': 42,
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'search_spaces': {'n_estimato

11

In [27]:
# NOTE(review): re-assigns the balancing flag after the experiment cells; this
# looks like a leftover scratch cell -- confirm whether it is still needed.
is_balanced = True
