In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import scipy as sp
import scipy.stats
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools
import re
# Regex matching a single decimal digit; used with re.sub to strip digits from
# column names before comparing them with the feature-family names.
res_digit = r'[0-9]'

# fourier transform
from scipy.fft import fft, ifft

from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE



In [None]:
# Hack: make the library in the parent folder available for imports.
# A better solution is described by np8 here:
# https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import sys
import os
import inspect

# sys.path[0] is assumed to be the directory this notebook runs from.
thisdir = sys.path[0]
print(f"thisdir = {thisdir}")
parentdir = os.path.dirname(thisdir)

if parentdir in sys.path:
    print("Skipping adding parent direct to path (there already)")
else:
    print("Adding parent directory to python path")
    # position 1 keeps the notebook directory itself first on the path
    sys.path.insert(1, parentdir)

print(f"sys.path =\n{sys.path}")



In [None]:
## Ensure the relative path to the data directory is sound.
# The notebook does not run from the project root, so the data folder
# (BASE_DATA_FOLDER) is overridden through the PREDICAMENT_DATA_DIR
# environment variable.
# NOTE(review): the variable is set before the predicament import below,
# presumably because predicament's config reads it at import time -- confirm.
import os 
os.environ['PREDICAMENT_DATA_DIR'] =  '../data'

from predicament.utils.config import DREEM_EEG_CHANNELS

In [None]:
from predicament.data.timeseries import create_participant_data_edf_only
from predicament.data.windowed import window_all_participants_data
from predicament.data.windowed import merge_condition_data
from predicament.data.partitioning import between_subject_cv_partition

from predicament.data.features import MAXIMAL_FEATURE_GROUP
from predicament.data.features import STATS_FEATURE_GROUP
from predicament.data.features import INFO_FEATURE_GROUP
from predicament.data.features import FREQ_FEATURE_GROUP
from predicament.data.features import convert_timeseries_to_features
from prepare_evaluation_data import load_dataframe_and_config

from predicament.evaluation.balancing import get_group_label_counts
from predicament.evaluation.balancing import balance_data
# from predicament.data.datasets import propose_balanced_subject_condition_counts
# from predicament.data.datasets import subsample_proposed_subject_condition_counts
# from predicament.data.datasets import get_subject_condition_counts

from predicament.evaluation.results import output_model_best_from_results
from predicament.evaluation.results import save_results_df_to_file


## Load features


In [None]:
# Read the pre-computed feature table and the configuration stored with it.
featured_df, featured_config = load_dataframe_and_config(
    '../data/featured/20231129210920/', 'featured.csv')

# Everything needed below lives in the LOAD section of that configuration.
_load_section = featured_config['LOAD']

# Scalar settings are stored as strings.
Fs = int(_load_section['sample_rate'])
window_size = int(_load_section['window_size'])
n_channels = int(_load_section['n_channels'])

# List-valued settings are stored with single quotes; swap them for double
# quotes so the json parser accepts them.
channels = json.loads(
    _load_section['channels'].replace("'",'"'))
participant_list = json.loads(
    _load_section['participant_list'].replace("'",'"'))

# Window duration: samples per window divided by the sample rate.
time = window_size/Fs
print(f"Fs: {Fs}, n_samples = {window_size}, time: {time}s, n_channels: {n_channels}")


In [None]:
# List the columns of the loaded feature table.
featured_df.columns

In [None]:
# Feature families to keep; 'SMA', 'Entropy' and 'EnergyBands' are excluded.
features_to_use = {
    'Mean', 'SD', 'MAD', 'Max', 'Min',
    'Energy', 'IQR',
    'arCoeff', 'Correlation', 'MaxFreqInd', 'MeanFreq', 'FreqSkewness',
    'FreqKurtosis',
}

# A column is kept when its name, with every digit removed, is one of the
# selected feature families.
columns_to_use = []
for col in featured_df.columns:
    if re.sub(res_digit, '', col) in features_to_use:
        columns_to_use.append(col)
print(f"columns_to_use = {columns_to_use}")

# Disabled: design matrix, labels and t-SNE embedding of the feature table.
# designmtx = featured_df[columns_to_use].values
# condition_data = featured_df['condition'].values.astype(int)
# subject_data_names = featured_df['participant']

# design2d = TSNE(n_components=2, init='random', perplexity=3).fit_transform(designmtx)
# print(f"design2d.shape = {design2d.shape}")

In [None]:
# Balance the featured data across participants and conditions.
# NOTE(review): balance_data is project code; presumably it equalises the
# number of rows per (participant, condition) pair -- confirm.
subject_condition_counts = get_group_label_counts(featured_df, 'participant', 'condition')
print(f"before balancing: subject_condition_counts = {subject_condition_counts}")

# Keep the balanced frame under an explicit name and rebind featured_df to it,
# so cells that read either name operate on the balanced data.
balanced_featured_df = balance_data(featured_df, group_col='participant', label_col='condition')
featured_df = balanced_featured_df

subject_condition_counts = get_group_label_counts(featured_df, 'participant', 'condition')
print(f"after balancing: subject_condition_counts = {subject_condition_counts}")

## Visualising data

In [None]:
def _scatter_by_label(embedding, labels, title, cmap=None):
    """Scatter a 2-D embedding using one discrete colour per integer label.

    Parameters
    ----------
    embedding : array of shape (n_samples, 2)
        Coordinates of the points to draw.
    labels : integer array of shape (n_samples,)
        Label of each point; labels are expected to be consecutive integers.
    title : str
        Title of the figure.
    cmap : matplotlib colormap, optional
        Colormap to draw from; defaults to ``plt.cm.rainbow``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the points were drawn on.
    """
    if cmap is None:
        cmap = plt.cm.rainbow
    labels = np.asarray(labels)
    # n consecutive labels need n + 1 bin edges; np.arange excludes its stop
    # value, so the stop must be max + 1.5 for the last label to get a bin.
    edges = np.arange(labels.min() - 0.5, labels.max() + 1.5)
    norm = colors.BoundaryNorm(edges, cmap.N)
    fig, ax = plt.subplots()
    points = ax.scatter(
        embedding[:, 0], embedding[:, 1],
        c=labels, cmap=cmap, norm=norm, s=0.5, edgecolor='none')
    fig.colorbar(points, ax=ax, ticks=np.unique(labels))
    ax.set_title(title)
    return ax


# Build everything the plots need inside this cell so that it runs on a
# fresh kernel: design matrix, integer labels and the 2-D embedding.
designmtx = featured_df[columns_to_use].to_numpy()
condition_data = featured_df['condition'].to_numpy().astype(int)
subject_data_names = featured_df['participant']

# subject_data[i] is the position in `subjects` of the participant of row i
subjects, subject_data = np.unique(
    subject_data_names.to_numpy(), return_inverse=True)
conditions = np.unique(condition_data)

print(f"designmtx.shape = {designmtx.shape}")
print(f"condition_data.shape = {condition_data.shape}")
print(f"subject_data.shape = {subject_data.shape}")

# 2-D t-SNE embedding of the feature rows; seeded so the figure is repeatable.
design2d = TSNE(
    n_components=2, init='random', perplexity=3, random_state=0
).fit_transform(designmtx)

_scatter_by_label(design2d, subject_data, 't-SNE embedding coloured by participant')
_scatter_by_label(design2d, condition_data, 't-SNE embedding coloured by condition');
        

## Classification

In [None]:
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import GroupKFold

# Candidate classifiers keyed by display name. Every estimator is seeded so
# that repeated runs of the searches below give the same scores.
# NOTE(review): the MLP receives the features unscaled and is limited to 100
# iterations -- consider a scaling step and check for convergence warnings.
models = {
    "Random Forest": RandomForestClassifier(
        min_samples_leaf=5, random_state=0
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        max_leaf_nodes=15, random_state=0
    ),
    "MLP": MLPClassifier(max_iter=100, random_state=0),
}

# Hyper-parameter grid for each model; keys match those of `models`.
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100]},
    "Gradient Boosting": {"n_estimators": [10, 20, 50, 100]},
    "MLP": {
        'hidden_layer_sizes': [(10,),(20,),(50,),(100,)],
        'activation': ['relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05],
        'learning_rate': ['constant','adaptive'],
    }
}

# Feature families to use; 'SMA', 'Entropy' and 'EnergyBands' are excluded.
features_to_use = set(
    ['Mean', 'SD', 'MAD', 'Max', 'Min',
     'Energy', 'IQR',
     'arCoeff', 'Correlation', 'MaxFreqInd', 'MeanFreq', 'FreqSkewness',
     'FreqKurtosis'
    ])
# Keep a column when its name with all digits removed is a selected family.
columns_to_use = [ col for col in featured_df.columns if re.sub(res_digit, '', col) in features_to_use ]
print(f"columns_to_use = {columns_to_use}")

# Design matrix (one row per window) and integer condition labels.
designmtx = featured_df[columns_to_use].to_numpy()

condition_data = featured_df['condition'].values.astype(int)


## Hold one group out

In [None]:
# Group-wise cross-validation: every row is tagged with the index of its
# participant, and there are as many folds as participants.
subjects, _participant_index = np.unique(
    featured_df['participant'], return_inverse=True)
n_subjects = len(subjects)
# groups[i] is the position in `subjects` of the participant of row i
groups = _participant_index.astype(int)

# cross validation splits
group_kfold = GroupKFold(n_splits=n_subjects)


In [None]:
# Grid-search every model with the group-wise folds and gather the
# cross-validation tables into a single frame.
result_frames = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=group_kfold,
    ).fit(X=designmtx, y=condition_data, groups=groups)
    result_df = pd.DataFrame(grid_search.cv_results_)
    # identifying columns, placed in front of the scikit-learn ones
    result_df.insert(0, 'model', name)
    result_df.insert(1, 'held out', 'subject')
    result_df.insert(2, 'feature set', str(features_to_use))
    display(result_df)
    result_frames.append(result_df)

# Concatenate once at the end instead of growing a frame inside the loop
# (avoids concatenating onto an empty DataFrame).
results_df = pd.concat(result_frames) if result_frames else pd.DataFrame()

In [None]:
# Development aid: reload predicament.evaluation.results and re-import names
# from the reloaded module.
# NOTE(review): output_model_best_from_results is not re-imported here, so the
# name bound by the earlier import still refers to the pre-reload function.
from importlib import reload
import predicament.evaluation.results
reload(predicament.evaluation.results)
# NOTE(review): `test` is not used in the cells visible here -- confirm it is needed.
from predicament.evaluation.results import test
from predicament.evaluation.results import save_results_df_to_file
# Display the accumulated grid-search results.
results_df

In [None]:
# Save the grid-search results; the second argument presumably names the
# output ('balanced_eeg') -- confirm against save_results_df_to_file.
save_results_df_to_file(results_df, 'balanced_eeg')

In [None]:
# Presumably reports the best configuration per model from the results table;
# the exact output is defined by output_model_best_from_results.
output_model_best_from_results(results_df)

In [None]:
# Display the grid-search results table.
results_df

## Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV 
# Candidate values for the randomised random-forest search.

# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so two distinct, valid settings are searched instead.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap}

In [None]:
# Randomised hyper-parameter search for a random forest.
# Base model to tune; seeded (like the forests in the grid search) so that
# repeated runs of the search give comparable scores.
rf = RandomForestClassifier(random_state=0)
# Sample 100 combinations from random_grid, score each of them with the
# group-wise folds in group_kfold and use all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=group_kfold,
    verbose=2,
    random_state=42,
    n_jobs=-1)


# Fit the random search model and save its cross-validation table.
rf_random.fit(X=designmtx, y=condition_data, groups=groups)
random_result_df = pd.DataFrame(rf_random.cv_results_)
save_results_df_to_file(random_result_df, 'random_search_random_forest')

In [None]:
# Bayesian optimisation of the MLP hyper-parameters

from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer

# Seeded so that the scores of a given configuration are repeatable.
estimator = MLPClassifier(max_iter=100, random_state=0)

# Search space; compared with the MLP grid used in the grid search it adds
# 'tanh' activations, more layer sizes and a continuous range for alpha.
search_spaces =      {
        'hidden_layer_sizes': Categorical([ (n,) for n in range(10,200,10)]),
        'activation': Categorical(['tanh', 'relu']),
        'solver': Categorical(['sgd', 'adam']),
        # log-uniform: understand as search over p = exp(x) by varying x
        'alpha': Real(1e-6, 1e+1, prior='log-uniform'),
        'learning_rate': Categorical(['constant','adaptive']),
     }

param_search = BayesSearchCV(
    estimator, search_spaces,
    cv=group_kfold, verbose=2, random_state=42, n_iter=50)

# executes bayesian optimization
_ = param_search.fit(X=designmtx, y=condition_data, groups=groups)
result_df = pd.DataFrame(param_search.cv_results_)
# Label the rows explicitly rather than relying on a loop variable left over
# from an earlier cell.
result_df.insert(0, 'model', 'MLP')
result_df.insert(1, 'held out', 'subject')
result_df.insert(2, 'feature set', str(features_to_use))
display(result_df)

save_results_df_to_file(result_df, 'bayes_search')

In [None]:
# Row with the highest mean test score, shown as a dict. The row is located
# with idxmax instead of a hard-coded index label, so it works for any run.
dict(result_df.loc[result_df['mean_test_score'].idxmax()])

# Balanced data 



In [None]:
# Tag every row of the balanced frame with the index of its participant.
subjects, _participant_index = np.unique(
    balanced_featured_df['participant'], return_inverse=True)
n_subjects = len(subjects)
# groups[i] is the position in `subjects` of the participant of row i
groups = _participant_index.astype(int)

# Feature families to use; 'SMA', 'Entropy' and 'EnergyBands' are excluded.
features_to_use = {
    'Mean', 'SD', 'MAD', 'Max', 'Min',
    'Energy', 'IQR',
    'arCoeff', 'Correlation', 'MaxFreqInd', 'MeanFreq', 'FreqSkewness',
    'FreqKurtosis',
}

# Keep a column when its name with all digits removed is a selected family.
columns_to_use = []
for col in balanced_featured_df.columns:
    if re.sub(res_digit, '', col) in features_to_use:
        columns_to_use.append(col)
print(f"columns_to_use = {columns_to_use}")

In [None]:
# Design matrix and integer condition labels taken from the balanced frame.
designmtx = balanced_featured_df[columns_to_use].values
condition_data = balanced_featured_df['condition'].values.astype(int)

# One fold per participant.
group_kfold = GroupKFold(n_splits=n_subjects)

# Grid-search every model and gather the cross-validation tables.
balanced_result_frames = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=group_kfold,
    ).fit(X=designmtx, y=condition_data, groups=groups)
    result_df = pd.DataFrame(grid_search.cv_results_)
    # identifying columns, placed in front of the scikit-learn ones
    result_df.insert(0, 'model', name)
    result_df.insert(1, 'held out', 'subject')
    result_df.insert(2, 'feature set', str(features_to_use))
    display(result_df)
    balanced_result_frames.append(result_df)

# Concatenate once at the end instead of growing a frame inside the loop
# (avoids concatenating onto an empty DataFrame).
balanced_results_df = (
    pd.concat(balanced_result_frames) if balanced_result_frames else pd.DataFrame()
)