In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import scipy as sp
import scipy.stats
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools
import re
# Regex matching a single decimal digit. Later in the notebook it is used with
# re.sub to remove the digits from feature column names (e.g. 'Mean3' -> 'Mean').
res_digit = r'[0-9]'

# fourier transform
from scipy.fft import fft, ifft

from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE



In [2]:
# Make the library in the parent folder importable from this notebook.
# This is a hack; a better solution is described by np8 here:
# https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import sys
import os
import inspect

# First entry of the module search path (the notebook's own folder in the
# recorded run); its parent is the folder we want to be importable.
thisdir = sys.path[0]
print(f"thisdir = {thisdir}")
parentdir = os.path.dirname(thisdir)

if parentdir in sys.path:
    print("Skipping adding parent direct to path (there already)")
else:
    print("Adding parent directory to python path")
    # position 1 keeps the notebook folder first on the path
    sys.path.insert(1, parentdir)

print(f"sys.path =\n{sys.path}")



thisdir = /home/luke/git/external/predicament/notebooks
Adding parent directory to python path
sys.path =
['/home/luke/git/external/predicament/notebooks', '/home/luke/git/external/predicament', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/luke/.local/lib/python3.10/site-packages', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/lib/python3.10/dist-packages']


In [3]:
## ensure relative path to data directory is sound
# For the notebook we need to modify the base data folder: the recorded run
# executes from the 'notebooks' folder, so the data folder is one level up.
# The environment variable is set BEFORE the predicament import below,
# presumably because the config module reads it at import time — TODO confirm.
import os 
os.environ['PREDICAMENT_DATA_DIR'] =  '../data'

from predicament.utils.config import FEATURED_BASE_PATH

In [4]:
from predicament.data.timeseries import create_participant_data_edf_only
from predicament.data.windowed import window_all_participants_data
from predicament.data.windowed import merge_condition_data
from predicament.data.partitioning import between_subject_cv_partition

from predicament.data.features import MAXIMAL_FEATURE_GROUP
from predicament.data.features import STATS_FEATURE_GROUP
from predicament.data.features import INFO_FEATURE_GROUP
from predicament.data.features import FREQ_FEATURE_GROUP
from predicament.data.features import convert_timeseries_to_features
from prepare_evaluation_data import load_dataframe_and_config


from predicament.data.datasets import propose_balanced_subject_condition_counts
from predicament.data.datasets import subsample_proposed_subject_condition_counts
from predicament.data.datasets import get_subject_condition_counts

from predicament.evaluation.results import output_model_best_from_results
from predicament.evaluation.results import save_results_df_to_file


In [5]:
from predicament.evaluation.results import test
from predicament.evaluation.results import save_results_df_to_file


## Load features

Before running this section you need to generate the featured data; see the README file for details. Then set the variable `subdir` below to the name of the subdirectory that contains your featured data.

In [6]:
import ast


def _parse_config_list(text):
    """Decode a list that was stored as text in the featured-data config.

    The value is first read as a Python literal, so entries that contain an
    apostrophe or a double quote survive intact. Text that is not a valid
    Python literal falls back to the previous behaviour of swapping single
    quotes for double quotes and decoding the result as JSON.

    Parameters
    ----------
    text : str
        The raw config value, e.g. "['a', 'b']".

    Returns
    -------
    The decoded object (a list for the values read in this cell).

    Raises
    ------
    json.JSONDecodeError
        If the text is neither a Python literal nor decodable by the fallback.
    """
    try:
        return ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError):
        return json.loads(text.replace("'", '"'))


# name of the featured-data subdirectory to load (see the markdown above)
subdir = '20231129210920'
featured_data_dir = os.path.join(FEATURED_BASE_PATH, subdir)

featured_df, featured_config = load_dataframe_and_config(
    featured_data_dir, 'featured.csv')

# settings recorded in the LOAD section of the config when the data was built
load_settings = featured_config['LOAD']
n_channels = int(load_settings['n_channels'])
channels = _parse_config_list(load_settings['channels'])
participant_list = _parse_config_list(load_settings['participant_list'])
Fs = int(load_settings['sample_rate'])
window_size = int(load_settings['window_size'])
# duration of one window: samples per window divided by the sample rate
time = window_size/Fs
print(f"Fs: {Fs}, n_samples = {window_size}, time: {time}s, n_channels: {n_channels}")


Fs: 250, n_samples = 1024, time: 4.096s, n_channels: 5


In [7]:
# List the columns of the featured data: the identifying columns
# ('part_ID', 'condition', 'start time') followed by numbered feature columns.
featured_df.columns

Index(['part_ID', 'condition', 'start time', 'Mean0', 'Mean1', 'Mean2',
       'Mean3', 'Mean4', 'SD0', 'SD1', 'SD2', 'SD3', 'SD4', 'MAD0', 'MAD1',
       'MAD2', 'MAD3', 'MAD4', 'Max0', 'Max1', 'Max2', 'Max3', 'Max4', 'Min0',
       'Min1', 'Min2', 'Min3', 'Min4', 'Energy0', 'Energy1', 'Energy2',
       'Energy3', 'Energy4', 'IQR0', 'IQR1', 'IQR2', 'IQR3', 'IQR4',
       'Correlation0', 'Correlation1', 'Correlation2', 'Correlation3',
       'Correlation4', 'Correlation5', 'Correlation6', 'Correlation7',
       'Correlation8', 'Correlation9', 'arCoeff0', 'arCoeff1', 'arCoeff2',
       'arCoeff3', 'arCoeff4', 'arCoeff5', 'arCoeff6', 'arCoeff7', 'arCoeff8',
       'arCoeff9', 'arCoeff10', 'arCoeff11', 'arCoeff12', 'arCoeff13',
       'arCoeff14', 'arCoeff15', 'arCoeff16', 'arCoeff17', 'arCoeff18',
       'arCoeff19', 'LyapunovExponent0', 'LyapunovExponent1',
       'LyapunovExponent2', 'LyapunovExponent3', 'LyapunovExponent4',
       'MaxFreqInd0', 'MaxFreqInd1', 'MaxFreqInd2', 'MaxFreqI

In [8]:
# Balance the data: propose a count for every subject/condition pair, report
# the class balance those counts imply, then subsample the featured data to
# match the proposal.
proposed_subject_condition_counts = propose_balanced_subject_condition_counts(
    featured_df)
# totals per condition (summing over the first axis of the proposal)
proposed_condition_counts = np.sum(proposed_subject_condition_counts, axis=0)
proposed_class_balance = (
    proposed_condition_counts / np.sum(proposed_condition_counts))

print(f"proposed_subject_condition_counts = {proposed_subject_condition_counts}")
print(f"proposed_condition_counts = {proposed_condition_counts}")
print(f"proposed_class_balance = {proposed_class_balance}")

print("balancing featured data...")
# NOTE: this replaces featured_df with its subsampled version
featured_df = subsample_proposed_subject_condition_counts(
    featured_df, proposed_subject_condition_counts)

# post-check: the counts after subsampling should equal the proposal
subject_condition_counts = get_subject_condition_counts(featured_df)
print(f"after balancing: subject_condition_counts = {subject_condition_counts}")

proposed_subject_condition_counts = [[304 322 461 283 296]
 [304 322 461 283 296]
 [304 110 344 283 296]
 [304 227 461 283 296]
 [304 322 344 227 296]
 [  0 110   0 283 296]
 [304 322   0 283 296]
 [304 322 461 283  86]
 [304 322   0 283 296]
 [304 322   0 283 296]
 [304 322 344 283 296]
 [304 322 461 283 296]]
proposed_condition_counts = [3344 3345 3337 3340 3342]
proposed_class_balance = [0.20014364 0.2002035  0.19972468 0.19990424 0.20002394]
balancing featured data...
after balancing: subject_condition_counts = [[304. 322. 461. 283. 296.]
 [304. 322. 461. 283. 296.]
 [304. 110. 344. 283. 296.]
 [304. 227. 461. 283. 296.]
 [304. 322. 344. 227. 296.]
 [  0. 110.   0. 283. 296.]
 [304. 322.   0. 283. 296.]
 [304. 322. 461. 283.  86.]
 [304. 322.   0. 283. 296.]
 [304. 322.   0. 283. 296.]
 [304. 322. 344. 283. 296.]
 [304. 322. 461. 283. 296.]]


## Classification

In [9]:
import pandas as pd

from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer


In [10]:
# Name under which the results of this notebook are recorded.
name = 'GradientBoosting'

# Which hyperparameter search to run:
# either 'random_search' or 'bayesian_optimization'.
search_type = 'bayesian_optimization'

# Feature types that go into the design matrix; remove entries to use a subset.
features_to_use = {
    'Mean', 'SD', 'MAD', 'Max', 'Min', 'Energy', 'IQR',
    'arCoeff', 'Correlation', 'MaxFreqInd', 'MeanFreq', 'FreqSkewness',
    'FreqKurtosis',
}

# A column belongs to a feature type when its name with all digits removed
# equals that type (e.g. 'Mean3' -> 'Mean').
digit_pattern = re.compile(res_digit)
columns_to_use = [
    column
    for column in featured_df.columns
    if digit_pattern.sub('', column) in features_to_use
]
print(f"columns_to_use = {columns_to_use}")

# inputs: one row per data row of featured_df, one column per selected feature
designmtx = featured_df.loc[:, columns_to_use].to_numpy()

# targets: the condition label of every row, as integers
condition_data = featured_df['condition'].values.astype(int)

columns_to_use = ['Mean0', 'Mean1', 'Mean2', 'Mean3', 'Mean4', 'SD0', 'SD1', 'SD2', 'SD3', 'SD4', 'MAD0', 'MAD1', 'MAD2', 'MAD3', 'MAD4', 'Max0', 'Max1', 'Max2', 'Max3', 'Max4', 'Min0', 'Min1', 'Min2', 'Min3', 'Min4', 'Energy0', 'Energy1', 'Energy2', 'Energy3', 'Energy4', 'IQR0', 'IQR1', 'IQR2', 'IQR3', 'IQR4', 'Correlation0', 'Correlation1', 'Correlation2', 'Correlation3', 'Correlation4', 'Correlation5', 'Correlation6', 'Correlation7', 'Correlation8', 'Correlation9', 'arCoeff0', 'arCoeff1', 'arCoeff2', 'arCoeff3', 'arCoeff4', 'arCoeff5', 'arCoeff6', 'arCoeff7', 'arCoeff8', 'arCoeff9', 'arCoeff10', 'arCoeff11', 'arCoeff12', 'arCoeff13', 'arCoeff14', 'arCoeff15', 'arCoeff16', 'arCoeff17', 'arCoeff18', 'arCoeff19', 'MaxFreqInd0', 'MaxFreqInd1', 'MaxFreqInd2', 'MaxFreqInd3', 'MaxFreqInd4', 'MeanFreq0', 'MeanFreq1', 'MeanFreq2', 'MeanFreq3', 'MeanFreq4', 'FreqSkewness0', 'FreqSkewness1', 'FreqSkewness2', 'FreqSkewness3', 'FreqSkewness4', 'FreqKurtosis0', 'FreqKurtosis1', 'FreqKurtosis2', '

## Hold one group out

In [11]:
# Hold-one-subject-out cross-validation: every distinct part_ID becomes one
# group, and there are as many folds as subjects.
# return_inverse gives, for each row, the index of its part_ID within the
# sorted unique values, which serves as that row's group label.
subjects, groups = np.unique(featured_df['part_ID'], return_inverse=True)
n_subjects = len(subjects)
groups = groups.astype(int)
# cross validation splits
group_kfold = GroupKFold(n_splits=n_subjects)
# number of iterations for your search
n_iter = 10

## Random Search

Here we sample hyperparameter combinations at random from a grid to search for the best ones. The cell below only runs when `search_type` is `'random_search'`.


In [12]:
if search_type == 'random_search':
    # RandomForestClassifier is not imported by the notebook's import cells,
    # so bring it into scope here; without this the branch raises NameError.
    from sklearn.ensemble import RandomForestClassifier

    # NOTE(review): this branch tunes a random forest, while `name` (set in an
    # earlier cell) labels the saved results — check the label before saving.

    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start=10, stop=200, num=10)]
    # Number of features to consider at every split.
    # 'auto' is no longer accepted by current scikit-learn releases (for a
    # classifier it meant the same as 'sqrt'), so 'log2' is offered as the
    # second option instead.
    max_features = ['sqrt', 'log2']
    # Maximum number of levels in tree (None leaves the depth unlimited)
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap}

    # First create the base model to tune
    estimator = RandomForestClassifier()
    # Random search of parameters: n_iter combinations are sampled from the
    # grid, each one is evaluated with the group_kfold splits defined above,
    # and all available cores are used.
    param_search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=random_grid,
        n_iter=n_iter,
        cv=group_kfold,
        verbose=2,
        random_state=42,
        n_jobs=-1)

    # Fit the random search model
    _ = param_search.fit(X=designmtx, y=condition_data, groups=groups)


## Bayesian Optimisation
Bayesian optimisation may be quicker and more effective than random search. The cell below only runs when `search_type` is `'bayesian_optimization'`.

In [14]:
# Bayesian optimisation of the gradient boosting hyperparameters

if search_type == 'bayesian_optimization':

    # A naive first guess at useful ranges. The log-uniform priors help to
    # focus the search on smaller values.
    search_spaces = {
        'learning_rate': Real(1e-6, 5e-1, prior='log-uniform'),
        'n_estimators': Integer(10, 200, prior='log-uniform'),
        'subsample': Real(1e-10, 1, prior='uniform'),
        'max_depth': Integer(1, 100, prior='log-uniform'),
        'max_features': Real(1e-1, 1, prior='log-uniform'),
    }
    # Not searched (left at the estimator's defaults): criterion,
    # min_weight_fraction_leaf, min_impurity_decrease, max_leaf_nodes,
    # ccp_alpha.

    estimator = GradientBoostingClassifier()
    # log-uniform: understand as search over p = exp(x) by varying x
    param_search = BayesSearchCV(
        estimator=estimator,
        search_spaces=search_spaces,
        n_iter=n_iter,
        cv=group_kfold,
        n_jobs=-1,
        verbose=3,
        random_state=42)

    # executes bayesian optimization
    _ = param_search.fit(X=designmtx, y=condition_data, groups=groups)


Fitting 12 folds for each of 1 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed: 15.5min remaining:  3.1min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 15.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 12 folds for each of 1 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed: 68.4min remaining: 13.7min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 69.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 12 folds for each of 1 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  2.4min remaining:   28.9s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 12 folds for each of 1 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  2.7min remaining:   32.9s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 12 folds for each of 1 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  6.2min remaining:  1.2min


KeyboardInterrupt: 

In [15]:
# Turn the search's cross-validation results into a table, label it with the
# model, hold-out scheme and feature set, then show it and save it.
# cv_results_ is read from the fitted search, so the search cell must have
# run to completion first.
result_df = pd.DataFrame(param_search.cv_results_)
result_df.insert(0, 'model', name)
result_df.insert(1, 'held out', 'subject')
# str() of a set of strings can list its members in a different order in each
# interpreter session, so the same feature set would get different labels.
# Build the same brace-delimited text from the sorted members instead.
feature_set_label = '{' + ', '.join(repr(f) for f in sorted(features_to_use)) + '}'
result_df.insert(2, 'feature set', feature_set_label)
display(result_df)
save_results_df_to_file(result_df, f'{search_type}_{name}')

AttributeError: 'BayesSearchCV' object has no attribute 'cv_results_'

In [None]:
# Pass the results table to the project's reporting helper.
# NOTE(review): presumably this reports the best-scoring configuration per
# model — confirm in predicament.evaluation.results.
output_model_best_from_results(result_df)

In [None]:
# Show the parameters of `estimator`, the base model that was constructed
# without arguments and handed to the search.
# NOTE(review): the tuned values are presumably held by param_search
# (e.g. best_params_) rather than by this object — confirm.
estimator.get_params()