In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import scipy as sp
import scipy.stats
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools
import re
# Regex matching a single decimal digit. Later cells strip these characters
# from feature column names to recover the base feature name.
res_digit = r'[0-9]'

# fourier transform
from scipy.fft import fft, ifft

from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.manifold import TSNE



In [None]:
# This is a hack to make the library in the parent folder available for imports
# A better solution is by np8 here:
# https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import sys
import os
import inspect

# Treat the first sys.path entry as this notebook's directory and make its
# parent importable, so modules living one level up can be imported.
thisdir = sys.path[0]
print(f"thisdir = {thisdir}")
parentdir = os.path.dirname(thisdir)
#print(f"parentdir = {parentdir}")
if parentdir in sys.path:
    print("Skipping adding parent direct to path (there already)")
else:
    print("Adding parent directory to python path")
    # Position 1 keeps the existing first entry at the front of the path.
    sys.path.insert(1, parentdir)

print(f"sys.path =\n{sys.path}")



In [None]:
## ensure relative path to data directory is sound
# for the notebook we need to modify the BASE_DATA_FOLDER
import os 
# Point the data-directory environment variable at the notebook-relative data
# folder. It is set before the predicament imports that follow; presumably the
# package reads it at import time — TODO confirm.
os.environ['PREDICAMENT_DATA_DIR'] =  '../data'

from predicament.utils.config import DREEM_EEG_CHANNELS

In [None]:
from predicament.data.timeseries import create_participant_data_edf_only
from predicament.data.windowed import window_all_participants_data
from predicament.data.windowed import merge_condition_data
from predicament.data.partitioning import between_subject_cv_partition

from predicament.data.features import MAXIMAL_FEATURE_GROUP
from predicament.data.features import STATS_FEATURE_GROUP
from predicament.data.features import INFO_FEATURE_GROUP
from predicament.data.features import FREQ_FEATURE_GROUP
from predicament.data.features import convert_timeseries_to_features
from prepare_evaluation_data import load_dataframe_and_config


## Load featured data

In [None]:
def _parse_quoted_list(text):
    """Decode a list written with single-quoted strings by swapping the quotes and parsing as JSON."""
    return json.loads(text.replace("'", '"'))


# Load the feature table together with the configuration it was generated with.
featured_df, featured_config = load_dataframe_and_config(
    '../data/featured/20231206193533/', 'featured.csv')
load_cfg = featured_config['LOAD']

data_format = load_cfg['data_format']
print(f"data_format: {data_format}")

n_channels = int(load_cfg['n_channels'])
channels = _parse_quoted_list(load_cfg['channels'])
participant_list = _parse_quoted_list(load_cfg['participant_list'])

# Sampling rate and windowing parameters, all stored as integers.
Fs = int(load_cfg['sample_rate'])
window_size = int(load_cfg['window_size'])
window_step = int(load_cfg['window_step'])

# Window duration: samples per window divided by samples per second.
time = window_size/Fs
print(f"Fs: {Fs}, n_samples = {window_size}, time: {time}s, n_channels: {n_channels}")

# How many window steps fit into one window (integer division).
window_overlap_factor = window_size//window_step
print(f"window_size: {window_size}, window_step: {window_step}, window_overlap_factor: {window_overlap_factor}")

In [None]:
# Show the column labels of the loaded feature table.
featured_df.columns

In [None]:
# Base feature names to keep; the commented-out names are deliberately excluded.
features_to_use = set(
    ['Mean', 'SD', 'MAD', 'Max', 'Min',# 'SMA',
      'Energy', 'IQR', # 'Entropy',
     'arCoeff', 'Correlation', 'MaxFreqInd', 'MeanFreq', 'FreqSkewness',
    'FreqKurtosis' # , 'EnergyBands'
    ])
# A column is kept when its name, with every digit removed, is one of the
# selected base feature names.
columns_to_use = [ col for col in featured_df.columns if re.sub(res_digit, '', col) in features_to_use ]
print(f"columns_to_use = {columns_to_use}")

designmtx = featured_df[columns_to_use].values
condition_data = featured_df['condition'].values.astype(int)
subject_data_names = featured_df['participant']

# Fixed random_state so the embedding (and the plots built from it) is the same
# on every Restart & Run All; init='random' is otherwise seeded differently each run.
# NOTE(review): perplexity=3 is far below scikit-learn's default of 30 — confirm
# this is intentional.
design2d = TSNE(
    n_components=2, init='random', perplexity=3, random_state=0
).fit_transform(designmtx)
print(f"design2d.shape = {design2d.shape}")

In [None]:
def _discrete_norm(labels, cmap):
    """Return a BoundaryNorm with one unit-wide bin centred on every integer from min(labels) to max(labels).

    The bin edges run from ``min - 0.5`` up to and including ``max + 0.5`` so
    that the largest label gets its own bin instead of falling outside the
    last boundary.
    """
    edges = np.arange(np.min(labels) - 0.5, np.max(labels) + 1.5)
    return colors.BoundaryNorm(edges, cmap.N)


# Encode each participant name as an integer id, in the sorted order of np.unique.
subjects = np.unique(subject_data_names)
subject_data = np.empty(subject_data_names.shape, dtype=int)
for s, sub in enumerate(subjects):
    subject_data[subject_data_names==sub] = s
print(f"designmtx.shape = {designmtx.shape}")
print(f"condition_data.shape = {condition_data.shape}")
print(f"subject_data.shape = {subject_data.shape}")

conditions = np.unique(condition_data)
markers = ['v', '^', '<', '>', 's', '*', '+' , 'x', 'D', '.']
colours = ['b','g','r','y','k']
cmap = plt.cm.rainbow

# Embedding coloured by subject id. cmap is passed explicitly so the colours
# match the colormap the norm was sized for.
norm = _discrete_norm(subject_data, cmap)
plt.scatter(
    design2d[:,0], design2d[:,1], c=subject_data, cmap=cmap, norm=norm, s=0.5, edgecolor='none')
plt.colorbar(
    ticks=np.arange(subjects.size))
plt.title("2d TSNE data coloured by subject id")


# Embedding coloured by condition id.
plt.figure()
norm = _discrete_norm(condition_data, cmap)
plt.scatter(
    design2d[:,0], design2d[:,1], c=condition_data, cmap=cmap, norm=norm, s=0.5, edgecolor='none')
# Tick at the actual condition ids, which need not start at 0.
plt.colorbar(
    ticks=conditions)
plt.title("2d TSNE data coloured by condition id")
        

## Classification

In [None]:
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import GroupKFold

# Candidate classifiers, keyed by the display name that is also used to look up
# the matching hyper-parameter grid in `param_grids`.
models = {
    "Random Forest": RandomForestClassifier(
        min_samples_leaf=5, random_state=0
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        max_leaf_nodes=15, random_state=0
    ),
    # random_state fixed like the tree models so MLP scores are reproducible
    # across runs (weight initialisation and shuffling are otherwise random).
    # NOTE(review): the MLP is fitted on unscaled features with max_iter=100 —
    # confirm this is intended.
    "MLP":  MLPClassifier(max_iter=100, random_state=0)
}
# Hyper-parameter grids searched by GridSearchCV; keys must match `models`.
param_grids = {
    "Random Forest": {"n_estimators": [10, 20, 50, 100]},
    "Gradient Boosting": {"n_estimators": [10, 20, 50, 100]},
    "MLP": {
        'hidden_layer_sizes': [(10,),(20,),(50,),(100,)],
        'activation': ['relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05],
        'learning_rate': ['constant','adaptive'],
    }
}



In [None]:
# # standard cross-validation
# # Match all digits in the string and replace them with an empty string
# # new_string = re.sub(pattern, '', string1)
# results_df = None

# features_to_use = set(
#     ['Mean', 'SD', 'MAD', 'Max', 'Min',# 'SMA',
#       'Energy', 'IQR', # 'Entropy',
#      'arCoeff', 'Correlation', 'MaxFreqInd', 'MeanFreq', 'FreqSkewness',
#     'FreqKurtosis' # , 'EnergyBands'
#     ])
# columns_to_use = [ col for col in featured_df.columns if re.sub(res_digit, '', col) in features_to_use ]
# print(f"columns_to_use = {columns_to_use}")

# designmtx = featured_df[columns_to_use].values 
# # condition_data = featured_df['condition'].values.astype(int)
# # subject_data = featured_df['subject'].values.astype(int)

# cv = KFold(n_splits=5, shuffle=True, random_state=0)
# results = []
# for name, model in models.items():
#     grid_search = GridSearchCV(
#         estimator=model,
#         param_grid=param_grids[name],
#         return_train_score=True,
#         cv=cv,
#     ).fit(designmtx, condition_data)
#     result = {"model": name, "cv_results": pd.DataFrame(grid_search.cv_results_)}
#     results.append(result)

In [None]:
# print(f"results[0]['mean_test_score'] ={results[0]['mean_test_score']}")
# print(f"results[1]['mean_test_score'] ={results[1]['mean_test_score']}")


## Hold one group out

In [None]:
# Grouped cross-validation setup: every row gets the integer id of its
# participant so that a participant's rows always share a fold.
# `subjects` holds the sorted unique participant names; the inverse index maps
# each row to the position of its participant in that array.
subjects, participant_index = np.unique(featured_df['participant'], return_inverse=True)
n_subjects = len(subjects)
groups = participant_index.astype(int)

# Base feature names to keep; the commented-out names are excluded.
features_to_use = set(
    ['Mean', 'SD', 'MAD', 'Max', 'Min',# 'SMA',
      'Energy', 'IQR', # 'Entropy',
     'arCoeff', 'Correlation', 'MaxFreqInd', 'MeanFreq', 'FreqSkewness',
    'FreqKurtosis' # , 'EnergyBands'
    ])
# Keep the columns whose digit-stripped name is a selected base feature.
columns_to_use = [
    col
    for col in featured_df.columns
    if re.sub(res_digit, '', col) in features_to_use
]
print(f"columns_to_use = {columns_to_use}")

designmtx = featured_df[columns_to_use].values

In [None]:
# Grid-search every model with grouped cross-validation where the groups are
# participants and there is one fold per participant.
group_kfold = GroupKFold(n_splits=n_subjects)
result_frames = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=group_kfold,
    ).fit(X=designmtx, y=condition_data, groups=groups)
    result_df = pd.DataFrame(grid_search.cv_results_)
    # Tag each row with the model, the hold-out scheme and the feature set used.
    # NOTE(review): str() of a set has no stable element order between runs.
    result_df.insert(0, 'model', name)
    result_df.insert(1, 'held out', 'subject')
    result_df.insert(2, 'feature set', str(features_to_use))
    display(result_df)
    result_frames.append(result_df)

# Concatenate once instead of growing the frame inside the loop. Row labels are
# kept as produced per model, so they restart at 0 for every model.
results_df = pd.concat(result_frames) if result_frames else pd.DataFrame()

In [None]:
# Display the grid-search results collected for all models.
results_df

In [None]:
import datetime
import os

# Write the leave-subject-out results to a timestamped CSV. The results folder
# is created first because to_csv fails when the target directory is missing.
results_dir = '../data/results'
os.makedirs(results_dir, exist_ok=True)
nowstr = datetime.datetime.now().replace(microsecond=0).isoformat()
results_df.to_csv(f'{results_dir}/{nowstr}_results.csv')

In [None]:
# List the display names of the configured models, one per line.
for name in models:
    print(name)

In [None]:
# Best mean test score reached by any MLP configuration.
results_df.loc[results_df['model'] == 'MLP', 'mean_test_score'].max()

In [None]:
# Best mean test score reached by any Random Forest configuration.
results_df.loc[results_df['model'] == 'Random Forest', 'mean_test_score'].max()

In [None]:
# Best mean test score reached by any Gradient Boosting configuration.
results_df.loc[results_df['model'] == 'Gradient Boosting', 'mean_test_score'].max()

In [None]:
# Parameter settings of the best-scoring MLP configuration(s), keyed by row label.
# The best score is computed from the data rather than matched against a float
# pasted from an earlier run, which no longer matches once the models are re-fitted.
# NOTE(review): restricted to the MLP rows because the next cell looks up row
# label 62, which only the 96-row MLP grid can contain — confirm that the MLP
# best was the intended target.
mlp_results = results_df[results_df['model'] == 'MLP']
d = dict(mlp_results.loc[
    mlp_results['mean_test_score'] == mlp_results['mean_test_score'].max(), 'params'])

In [None]:
# Show every selected parameter set instead of indexing one hard-coded row
# label, which raises KeyError as soon as a different row is selected.
list(d.values())

## Held out subject-conditions

In [None]:
# Every (participant, condition) pair gets an integer id: its position in this
# list, which enumerates conditions within each participant.
subject_conditions = [(s,c) for s in subjects for c in conditions]

# Label each row with the id of its (participant, condition) pair. Rows that
# match no pair keep the initial None.
featured_df['subject_conditions'] = None
for i, (s, c) in enumerate(subject_conditions):
    pair_mask = (featured_df['participant'] == s) & (featured_df['condition'] == c)
    # Single .loc assignment: the chained form df[col][mask] = i writes to a
    # temporary under pandas Copy-on-Write and warns on older versions.
    featured_df.loc[pair_mask, 'subject_conditions'] = i
featured_df['subject_conditions']

In [None]:
# Show the column labels again; the cell above adds the 'subject_conditions' column.
featured_df.columns

In [None]:
# Grid-search every model with grouped cross-validation where the groups are
# (participant, condition) pairs, split into n_subjects folds.
sc_groups = featured_df['subject_conditions'].to_numpy()
sc_group_kfold = GroupKFold(n_splits=n_subjects)
sc_result_frames = []
for name, model in models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        return_train_score=True,
        cv=sc_group_kfold,
    ).fit(X=designmtx, y=condition_data, groups=sc_groups)
    result_df = pd.DataFrame(grid_search.cv_results_)
    # Tag each row with the model, the hold-out scheme and the feature set used.
    # NOTE(review): str() of a set has no stable element order between runs.
    result_df.insert(0, 'model', name)
    result_df.insert(1, 'held out', 'subject_condition')
    result_df.insert(2, 'feature set', str(features_to_use))
    display(result_df)
    sc_result_frames.append(result_df)

# Concatenate once instead of growing the frame inside the loop. Row labels are
# kept as produced per model, so they restart at 0 for every model.
sc_results_df = pd.concat(sc_result_frames) if sc_result_frames else pd.DataFrame()

In [None]:
import datetime
import os

# Write the held-out subject-condition results to a timestamped CSV. This cell
# previously wrote results_df (the leave-subject-out results) a second time, so
# the subject-condition results were never saved.
results_dir = '../data/results'
os.makedirs(results_dir, exist_ok=True)
nowstr = datetime.datetime.now().replace(microsecond=0).isoformat()
sc_results_df.to_csv(f'{results_dir}/{nowstr}_results.csv')

In [None]:
# Report, per model, the best mean test score and the parameter set(s) reaching it.
for model_name in np.unique(sc_results_df['model']):
    # Restrict to this model's rows first, so that a configuration of another
    # model that happens to have the same score is not reported here.
    model_results = sc_results_df[sc_results_df['model'] == model_name]
    model_max_test_score = model_results['mean_test_score'].max()
    print(f"{model_name}: max_test_score= {model_max_test_score}")
    best_rows = model_results[model_results['mean_test_score'] == model_max_test_score]
    for model_best_params in best_rows['params']:
        print(f"best params: {model_best_params}")
    print()

## Held out subject-condition-phases

In [None]:
# For every participant/condition pair, derive two phases from the range of
# 'start time' values: one ending `margin` before the midpoint and one starting
# `margin` after it. Nothing is stored; the boundaries are only printed.
for participant, condition in itertools.product(subjects, conditions):
    _filter = (featured_df['participant'] == participant) & (featured_df['condition'] == condition)
    sc_featured_df = featured_df[_filter]
    start_times = sc_featured_df['start time']
    min_time_index, max_time_index = start_times.min(), start_times.max()
    mid_time_index = (min_time_index+max_time_index)//2
    # Half the overlap factor, rounded up (integer ceiling division).
    margin = -(-window_overlap_factor//2)
    phase1_start_index, phase1_end_index = min_time_index, mid_time_index-margin
    phase2_start_index, phase2_end_index = mid_time_index+margin, max_time_index
    print(f"participant: {participant}, condition: {condition}")
    print(f"min: {min_time_index}, max: {max_time_index}, mid: {mid_time_index}")
    # Phase details are only printed when phase 1 is non-empty.
    if phase1_end_index > phase1_start_index:
        print(f"phase1_start_index: {phase1_start_index}, phase1_end_index: {phase1_end_index}")
        print(f"phase2_start_index: {phase2_start_index}, phase2_end_index: {phase2_end_index}")
    print()

In [None]:
# Print how many rows each condition has, and its share of all rows.
tot = len(featured_df)
condition_column = featured_df['condition']
for c in np.unique(condition_column):
    count = int((condition_column == c).sum())
    print(f"count = {count}")
    print(f"{c} : {count}/{tot} = {count/tot}")