In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import scipy as sp
import scipy.stats
from tqdm import tqdm
import itertools
import re
res_digit = r'[0-9]'

from importlib import reload

from sklearn.metrics import classification_report

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn import preprocessing 


from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer


In [2]:
# Make the library in the notebook's parent folder available for imports.
# This is a hack; a better solution is described by np8 here:
# https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import sys
import os
import inspect

# The first sys.path entry is taken to be the directory this notebook runs from.
thisdir = sys.path[0]
print(f"thisdir = {thisdir}")
parentdir = os.path.dirname(thisdir)

if parentdir in sys.path:
    print("Skipping adding parent direct to path (there already)")
else:
    print("Adding parent directory to python path")
    # Index 1 keeps the notebook's own directory as the first search location.
    sys.path.insert(1, parentdir)

print(f"sys.path =\n{sys.path}")



thisdir = /home/luke/git/external/predicament/notebooks
Adding parent directory to python path
sys.path =
['/home/luke/git/external/predicament/notebooks', '/home/luke/git/external/predicament', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/luke/.local/lib/python3.10/site-packages', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/lib/python3.10/dist-packages']


In [3]:
# The notebook lives one level below the repository root, so the data folder
# has to be addressed relative to this directory. The override is passed to the
# library through the PREDICAMENT_DATA_DIR environment variable.
# NOTE(review): presumably this must run before the predicament imports below
# read the variable — confirm against predicament.utils.config.
import os

os.environ["PREDICAMENT_DATA_DIR"] = "../data"



In [4]:
from predicament.utils.file_utils import load_dataframe_and_config
import predicament.utils.config_parser
reload(predicament.utils.config_parser)
from predicament.utils.config_parser import config_to_dict

from predicament.utils.config import FEATURED_BASE_PATH
from predicament.data.features import IDEAL_FEATURE_GROUP

from predicament.evaluation.balancing import get_group_label_counts
from predicament.evaluation.balancing import balance_data
from predicament.evaluation.grouping import get_group_assignments
from predicament.evaluation.staging import get_design_matrix
from predicament.evaluation.results import output_model_best_from_results
from predicament.evaluation.results import save_results_df_to_file

from predicament.evaluation.hyperparameters import get_param_scopes
from predicament.evaluation.hyperparameters import get_param_search_object

from predicament.models.mlp_wrappers import ThreeHiddenLayerClassifier


In [5]:
# High-level choices for this experiment. Everything a reader might want to
# tune lives in this cell.

# -- data --
subdir = 'binary_dreem_4secs'    # featured-data subdirectory (the dataset)
held_out = 'participant'         # what is held out during evaluation
is_balanced = True               # balance the data set before the search
use_only_ideal_features = True   # restrict to the preferred "ideal" features
standardise_data = False         # apply StandardScaler to the design matrix

# -- hyperparameter search --
max_iter_opt = 200               # only referenced by the (commented-out) MLPClassifier option
n_iter = 50                      # number of iterations for the search
random_state = 43                # seed handed to the search object
new_search = True                # True restarts the search from scratch
use_callback = True              # record intermediate results at every search step

## Load featured data and balance if required

Before running this section, you need to generate the featured data (see the README file for details). Then set the variable `subdir` above to the name of the subdirectory that contains the featured data.

In [6]:
# Featured data for the selected dataset is read from <FEATURED_BASE_PATH>/<subdir>.
featured_data_dir = os.path.join(FEATURED_BASE_PATH, subdir)

# Load the feature table together with the configuration stored alongside it.
featured_df, featured_config = load_dataframe_and_config(
    featured_data_dir,
    'featured.csv',
)



In [7]:
# Pull the recording/windowing properties out of the LOAD section of the config.
load_config = featured_config['LOAD']

n_channels = load_config['n_channels']
data_format = load_config['data_format']
channels = load_config['channels']
participant_list = load_config['participant_list']

sample_rate = load_config['sample_rate']
Fs = sample_rate  # short alias for the sampling frequency
window_size = load_config['window_size']

# Window duration in seconds: samples per window divided by samples per second.
time = window_size / sample_rate
print(f"sample_rate: {sample_rate}, n_samples = {window_size}, time: {time}s, n_channels: {n_channels}")


sample_rate: 250, n_samples = 1024, time: 4.096s, n_channels: 4


In [8]:
if is_balanced:
    # Report the participant-by-condition counts, balance the data, then
    # report the counts again so the effect is visible.
    # NOTE: `featured_df` is rebound to the balanced frame, so re-running this
    # cell feeds already-balanced data back into balance_data.
    counts_before = get_group_label_counts(
        featured_df, 'participant', 'condition')
    print(f"before balancing: subject_condition_counts = {counts_before}")

    featured_df = balance_data(
        featured_df, group_col='participant', label_col='condition')

    subject_condition_counts = get_group_label_counts(
        featured_df, 'participant', 'condition')
    print(f"after balancing: subject_condition_counts = {subject_condition_counts}")

before balancing: subject_condition_counts = [[2657.    0.]
 [2638.    0.]
 [2072.  110.]
 [2423.  805.]
 [2423.  344.]
 [1501.  110.]
 [2122.  461.]
 [2282.  688.]
 [2079.  110.]
 [2027.  285.]
 [2540.  344.]
 [2540.  805.]]
after balancing: subject_condition_counts = [[339.   0.]
 [339.   0.]
 [339. 110.]
 [339. 805.]
 [339. 344.]
 [339. 110.]
 [339. 461.]
 [339. 688.]
 [339. 110.]
 [339. 285.]
 [339. 344.]
 [339. 805.]]


## Define model and hyperparameter search

In [9]:
# The base model to tune. Swap in one of the commented alternatives to tune a
# different estimator.
estimator = SVC()
# estimator = GradientBoostingClassifier()
# estimator = RandomForestClassifier()
# estimator = MLPClassifier(max_iter=max_iter_opt)   # optionally with max_iter_opt = 1
# estimator = ThreeHiddenLayerClassifier()
print(f"estimator = {estimator}")

# Arguments forwarded to get_param_scopes: names to exclude from the search
# and keyword overrides (both empty here).
excludes = []
# excludes = ['layer3']             # for 2 (hidden) layer MLP (leave empty for 3 layer MLP)
# excludes = ['layer2', 'layer3']   # for 1 (hidden) layer MLP
overrides = {}

# Search strategy.
search_type = 'bayesian_optimization'
# search_type = 'random_search'

# Build the search space for this estimator / search-type combination.
param_scopes = get_param_scopes(
    search_type, estimator, excludes=excludes, **overrides)
print(f"param_scopes = {param_scopes}")


estimator = SVC()
param_scopes = {'C': Real(low=0.1, high=10000.0, prior='log-uniform', transform='identity'), 'kernel': Categorical(categories=('rbf', 'sigmoid'), prior=None), 'gamma': Real(low=1e-09, high=1, prior='log-uniform', transform='identity'), 'coef0': Real(low=-1, high=1, prior='uniform', transform='identity'), 'shrinking': Categorical(categories=(True, False), prior=None)}


## Define data properties and data-split 

In [10]:
# Feature families recorded in the featured-data config.
feature_set = featured_config['FEATURED']['feature_set']
if use_only_ideal_features:
    # Restrict to the preferred ("ideal") features. The result is sorted
    # because iterating a set of strings has no stable order between
    # interpreter sessions (string hash randomisation); without sorting the
    # feature order could differ from one kernel restart to the next.
    feature_set = sorted(IDEAL_FEATURE_GROUP.intersection(feature_set))

print(f"feature_set = {feature_set}")

# Extract the input data (design matrix) for the selected features.
feature_types, feature_names, designmtx = get_design_matrix(
    featured_df, feature_set)
# Extract the class labels as integers.
labels = featured_df['condition'].values.astype(int)

if standardise_data:
    # NOTE(review): the scaler is fitted on every row, including the rows that
    # are later held out in each cross-validation fold, so held-out statistics
    # leak into training. Consider scaling inside a Pipeline instead.
    scaler = preprocessing.StandardScaler().fit(designmtx)
    designmtx = scaler.transform(designmtx)

# Prepare hold-one-group-out cross validation: one split per group.
# NOTE: this rebinds `held_out`, replacing the value set in the configuration cell.
held_out, groups, group_assignments = get_group_assignments(featured_df)
n_groups = len(groups)
# Cross-validation splitter with as many folds as there are groups.
group_kfold = GroupKFold(n_splits=n_groups)

feature_set = ['Correlation', 'FreqKurtosis', 'SampleEntropy', 'SD', 'Mean', 'Hurst', 'IQR', 'Max', 'LyapunovExponent', 'MaxFreqInd', 'LempelZivComplexity', 'MeanFreq', 'arCoeff', 'MAD', 'Min']


## Define and Execute hyperparameter search strategy 

In [None]:
# Fix to avoid an error in BayesSearchCV.fit: re-create the `np.int` alias,
# which recent NumPy releases no longer provide.
# NOTE(review): presumably the installed scikit-optimize still refers to
# `np.int` — confirm, and drop this shim once it is no longer needed.
import numpy as np
np.int = int

def create_callback_and_storage(param_search):
    """Create a per-step callback together with the list it records into.

    Parameters
    ----------
    param_search
        Accepted for the caller's convenience; not used by this function.

    Returns
    -------
    (on_step, intermediate_results)
        `on_step(optim_result)` appends the tuple
        `(optim_result.func_vals, optim_result.x_iters)` to
        `intermediate_results` and prints the best score and parameters
        found so far. `intermediate_results` starts out empty.
    """
    history = []

    def on_step(optim_result):
        """Record the current optimisation state and report the best so far."""
        snapshot = (optim_result.func_vals, optim_result.x_iters)
        history.append(snapshot)

        # The reported score is the negated objective value.
        print("Best score: %s" % (-optim_result.fun))
        print("Best parameters: %s" % optim_result.x)

    return on_step, history


# Build the hyperparameter search object for the chosen search type and
# estimator. It is re-created every time this cell runs.
# NOTE(review): how `fit_params` is consumed depends on get_param_search_object
# (defined in predicament.evaluation.hyperparameters) — confirm it is needed.
param_search  = get_param_search_object(
    search_type, estimator, param_scopes=param_scopes, 
    n_iter = n_iter, cv=group_kfold,
    verbose=2, random_state=random_state, n_jobs=-1,
    fit_params={'X': designmtx, 'y': labels, 'callbacks': None},
    refit=False  # Ensure that the search does not refit the model with the best parameters found so far
)

if use_callback:
    if new_search:
        # Fresh search: create a new callback and an empty results store.
        on_step, intermediate_results = create_callback_and_storage(param_search)
    else:
        # Re-run of this cell: reuse `on_step` / `intermediate_results` left in
        # the kernel by a previous run (NameError on a fresh kernel if
        # new_search is False).
        # NOTE(review): whether the search object reads
        # fit_params['search_results'] is not visible here — confirm.
        param_search.fit_params['search_results'] = intermediate_results
    try:
        # Fit the search (of whichever `search_type` was selected), calling
        # `on_step` via the `callback` argument.
        # NOTE(review): `callback=` is presumably only understood by the
        # Bayesian search object — confirm before using with 'random_search'.
        _ = param_search.fit(X=designmtx, y=labels, groups=group_assignments, callback=on_step)
    except Exception as e:
        # Errors are reported but not re-raised; only the message is printed,
        # so the traceback is lost and later cells may fail on a missing
        # `cv_results_`.
        print("Exception occurred:", str(e))
    finally:
        # Print or process intermediate results even if an error occurs
        print("Intermediate Results:")
        for i, (scores, params) in enumerate(intermediate_results):
            print(f"Iteration {i + 1}: Scores - {scores}, Params - {params}")
        # Mutates the configuration for the next run of this cell: the next run
        # takes the "resume" branch above and uses a fresh, unseeded random
        # state, so re-runs are not reproducible.
        new_search = False
        random_state = np.random.randint(100000)
else:
    # Plain fit without recording intermediate results.
    _ = param_search.fit(X=designmtx, y=labels, groups=group_assignments)


Fitting 12 folds for each of 1 candidates, totalling 12 fits
Best score: 0.3598576477794529
Best parameters: [40.33985072818989, -0.493355159370347, 1.0837982698716211e-07, 'rbf', False]
Fitting 12 folds for each of 1 candidates, totalling 12 fits
Best score: 0.3598576477794529
Best parameters: [40.33985072818989, -0.493355159370347, 1.0837982698716211e-07, 'rbf', False]
Fitting 12 folds for each of 1 candidates, totalling 12 fits
Best score: 0.3598576477794529
Best parameters: [40.33985072818989, -0.493355159370347, 1.0837982698716211e-07, 'rbf', False]
Fitting 12 folds for each of 1 candidates, totalling 12 fits
Best score: 0.3598576477794529
Best parameters: [40.33985072818989, -0.493355159370347, 1.0837982698716211e-07, 'rbf', False]
Fitting 12 folds for each of 1 candidates, totalling 12 fits
Best score: 0.4279802955698744
Best parameters: [2.4206178382891803, 0.08349600882982644, 3.415679480068009e-05, 'rbf', False]
Fitting 12 folds for each of 1 candidates, totalling 12 fits
Bes

In [None]:
# Display the parameters of the search object to check how it was configured.
param_search.get_params()

## Saving and outputting results

In [None]:
# One row per evaluated candidate, taken from the search's cv_results_.
result_df = pd.DataFrame(param_search.cv_results_)

# Columns describing this run, inserted at the front of the table in this order.
run_metadata = [
    ('model', str(estimator)),
    ('data format', data_format),
    ('held out', held_out),
    ('balanced', is_balanced),
    ('n_splits', param_search.get_params()['cv'].get_n_splits()),
    ('feature set', str(feature_types)),
    ('window size', window_size),
]
for i, (column, value) in enumerate(run_metadata):
    result_df.insert(i, column, value)
display(result_df)

# The results file is named after the search type and the estimator's string form.
results_fname = f'{search_type}_{str(estimator)}'
print(f"Saving to {results_fname}")
save_results_df_to_file(result_df, results_fname)

In [None]:
# Report the best result(s) found in the results table; the return value is discarded.
# NOTE(review): the exact output is produced by output_model_best_from_results
# (predicament.evaluation.results) — see that helper for details.
_ = output_model_best_from_results(result_df)


In [None]:
# Matches one trailing index suffix of a column name: either a bracketed index
# such as "[3]" (digits optional) or a bare run of digits such as "12".
_INDEX_SUFFIX = re.compile(r'(?:\[[0-9]*\]|[0-9]+)\Z')


def _strip_index_suffix(column_name):
    """Return `column_name` without its trailing index suffix, if it has one.

    "Name[3]" -> "Name", "Name12" -> "Name". Names without such a suffix
    (including empty or bracket-only names) are returned unchanged instead of
    raising IndexError.
    """
    return _INDEX_SUFFIX.sub('', column_name, count=1)


label_cols = featured_config['WINDOWED']['label_cols']

# Sorted copy for display only. It is kept under its own name so that the
# `feature_types` returned by get_design_matrix (and written into the results
# table) is not overwritten when this cell runs.
sorted_feature_set = sorted(feature_set)
print(f"# Feature Set:\n{sorted_feature_set}")

# Collect the distinct base names of the feature columns that belong to one of
# the selected feature families.
derived_feature_types = set()
for column in featured_df.columns:
    if column in label_cols:
        # label columns are not features
        continue
    base_name = _strip_index_suffix(column)
    if any(base_name.startswith(type_) for type_ in feature_set):
        derived_feature_types.add(base_name)

derived_feature_types = sorted(derived_feature_types)
output = ';'.join(derived_feature_types)
print(f"Derived Feature Types:\n{output}")

In [None]:
# For each score threshold, compute the proportion of evaluated candidates
# whose mean test score is strictly greater than that threshold.
thresholds = np.linspace(0, 0.7, 51)
N = len(result_df)
mean_scores = result_df['mean_test_score']
props = np.array([np.sum(mean_scores > thresh) / N for thresh in thresholds])

plt.plot(thresholds, props)
plt.xlabel("mean test score")
plt.ylabel("proportion greater than")

In [None]:
# List the columns available in the results table.
result_df.columns

In [None]:
# Show only the candidates whose mean test score reaches this threshold.
display_thresh = 0.6

# Keep the parameter columns plus the mean and standard deviation of the test score.
display_cols = [
    col for col in result_df.columns
    if col.startswith('param') or col in ('mean_test_score', 'std_test_score')
]
result_df.loc[result_df['mean_test_score'] >= display_thresh, display_cols]

In [None]:
# Alternatively, show every row, restricted to the same display columns
# (`display_cols` is defined in the previous cell).
result_df[display_cols]