In [1]:
import numpy as np
import os
import os.path as osp
from DatasetLoader.dataset_loader import DatasetLoader
from DatasetLoader.signal_processing import *
from sklearn.impute import SimpleImputer
from collections import defaultdict
from typing import Tuple
from Classifiers.classifiers import Classifier
from collections import Counter
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

In [2]:
# Names of the feature columns, handed to Classifier through `feature_labels`.
# NOTE(review): presumably ordered like the output of
# statistics_gsr_signal_features -- confirm against that helper.
feature_labels = [
    'mean_scl', 'std_scl', 'std_scr', 'corr',
    'num_responses', 'sum_scr_response_duration',
    'sum_scr_amplitude', 'area_of_response_curve',
    'num_scr_peaks',
    'mean_eda', 'std_eda', 'min_eda', 'max_eda', 'eda_dynamic_range',
    'mean_scr', 'max_scr', 'min_scr', 'kurtosis_scr', 'skewness_scr',
    'mean_first_grad', 'std_first_grad',
    'mean_second_grad', 'std_second_grad',
    'mean_peaks', 'max_peaks', 'min_peaks', 'std_peaks',
    'mean_onsets', 'max_onsets', 'min_onsets', 'std_onsets',
    'ALSC', 'INSC', 'APSC', 'RMSC',
]

In [91]:
def _fill_missing_with_zero(signal) -> np.ndarray:
    """Return a flat copy of `signal` in which every NaN is replaced by 0."""
    values = np.array(signal).reshape(-1)  # np.array copies, so the caller's data is untouched
    if values.dtype.kind not in "fiub":
        # Non-numeric dtypes (e.g. object arrays) are coerced to float so NaN detection works.
        values = values.astype(float)
    if values.dtype.kind == "f":
        values[np.isnan(values)] = 0
    return values


def preprocess_data(flatten_agg_data: np.ndarray, flatten_agg_ground_truth: np.ndarray, flatten_agg_group: np.ndarray = None, sampling_rate: int = 5, min_samples: int = 150) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Drop too-short signals and zero-fill missing readings in the rest.

    Args:
        flatten_agg_data: Sequence of 1-D signals (one entry per interval).
        flatten_agg_ground_truth: Labels aligned with `flatten_agg_data`.
        flatten_agg_group: Optional group ids aligned with `flatten_agg_data`.
        sampling_rate: Currently unused; kept so existing callers keep working.
        min_samples: Minimum number of values a signal needs in order to be
            kept. Defaults to 150, the threshold previously hard-coded here.

    Returns:
        A tuple `(signals, ground_truth, groups)` restricted to the kept
        entries. `signals` is a 2-D array when all kept signals share one
        length and a 1-D object array of 1-D arrays otherwise. `groups` is
        None when `flatten_agg_group` is None.
    """
    # Indices of the signals that are long enough to be kept.
    keep = np.array(
        [index for index, signal in enumerate(flatten_agg_data) if len(signal) >= min_samples],
        dtype=np.intp,
    )

    # Missing values (device errors during data collection) are replaced by 0.
    kept_signals = [_fill_missing_with_zero(flatten_agg_data[index]) for index in keep]
    kept_ground_truth = np.asarray(flatten_agg_ground_truth)[keep]
    kept_groups = None if flatten_agg_group is None else np.asarray(flatten_agg_group)[keep]

    if len({len(signal) for signal in kept_signals}) <= 1:
        imputed_agg_data = np.array(kept_signals)
    else:
        # Signals of different lengths cannot be stacked; recent NumPy raises on
        # np.array(ragged_list), so the object array is filled explicitly.
        imputed_agg_data = np.empty(len(kept_signals), dtype=object)
        for position, signal in enumerate(kept_signals):
            imputed_agg_data[position] = signal

    return imputed_agg_data, kept_ground_truth, kept_groups

In [22]:
def extract_features(preprocessed_data: np.array, sampling_rate: int = 5) -> np.array:
    """Build one row of statistical features per preprocessed signal.

    Each signal is first passed to `extract_gsr_features` (with the given
    `sampling_rate`); every result is then passed to
    `statistics_gsr_signal_features`, and the rows are stacked with `np.array`.
    """
    # First pass: per-signal GSR feature extraction.
    gsr_features = []
    for signal in preprocessed_data:
        gsr_features.append(extract_gsr_features(signal, sampling_rate=sampling_rate))

    # Second pass: statistics over each extracted feature set.
    feature_rows = list(map(statistics_gsr_signal_features, gsr_features))
    return np.array(feature_rows)

# Collected GSR Data

In [56]:
# Instantiate the project's dataset loader and load the collected GSR
# recordings together with their ground-truth annotations.
dataset_loader = DatasetLoader()
collected_gsr_data, ground_truth = dataset_loader.load_collected_gsr_dataset()

In [57]:
# Reduce the loaded data to a single signal (index 1 here; the WESAD cells pass 0).
# NOTE(review): the meaning of the index is defined by select_single_signal -- confirm.
single_signal_data = select_single_signal(collected_gsr_data, 1)

In [58]:
# Split the recordings into intervals; the helper returns data, labels and group
# ids. NOTE(review): the positional 60 is presumably the interval length in
# seconds (cf. the "One-minute interval split" heading) -- confirm.
interval_gsr_data, interval_ground_truth, interval_group = dataset_loader.divide_into_intervals(single_signal_data, ground_truth, 60, sampling_rate = 5)

### One-minute interval split


In [59]:
# Flatten the interval containers (data, labels, group ids) in that order and
# report how many distinct groups and samples result.
flatten_agg_data, flatten_agg_ground_truth, flatten_agg_group = (
    dataset_loader.flatten(nested)
    for nested in (interval_gsr_data, interval_ground_truth, interval_group)
)
print(f"Number of subjects: {len(set(flatten_agg_group))}")
print(f"Number of samples: {len(flatten_agg_data)}")

Number of subjects: 11
Number of samples: 640


In [60]:
# Filter out too-short intervals and fill missing values, then compute the
# statistical feature matrix (extract_features runs with its default sampling_rate of 5).
preprocessed_data, preprocessed_ground_truth, preprocessed_groups = preprocess_data(flatten_agg_data, flatten_agg_ground_truth, flatten_agg_group=flatten_agg_group)
statistic_features = extract_features(preprocessed_data)

## General Cross-population Classifier

### Binary Classification

In [61]:
# Binarise the labels: 0 when the first ground-truth entry is below 2, otherwise 1.
binary_ground_truth = np.array([0 if value[0] < 2 else 1 for value in preprocessed_ground_truth])
X = statistic_features
y = binary_ground_truth

In [62]:
# Build the cross-subject classifier wrapper with per-sample group ids
# (logo_validation=True; presumably leave-one-group-out -- confirm in Classifier).
# Every training call below is commented out, so as written this cell only
# constructs the wrapper and reports the class distribution of y.
clf = Classifier(X, y, groups = preprocessed_groups, logo_validation = True, feature_labels = feature_labels)
# test_groups, f1_scores = clf.logistic_regression()
# test_groups, f1_scores = clf.random_forest_classifier()
# test_groups, f1_scores = clf.support_vector_machine()
# test_groups, f1_scores = clf.multilayer_perceptron()
# test_groups, f1_scores = clf.knn_classifier()
dataset_loader.class_percentage_analysis(y)
# clf.dump_csv(test_groups, f1_scores, 'DCU-NVT-EXP1-General-KNN.csv')

0: 457
1: 140


## Person-specific stress detection model

### Binary classification

In [31]:
# Build per-participant containers for the person-specific stress detection
# models: participant id -> data, and participant id -> ground truth.
person_specific_dataset = defaultdict(dict)
person_specific_ground_truth = defaultdict(dict)
for participant_id, data in single_signal_data.items():
    person_specific_data, person_specific_gt = dataset_loader.prepare_person_specific_dataset(data, ground_truth)
    person_specific_dataset[participant_id], person_specific_ground_truth[participant_id] = (
        person_specific_data,
        person_specific_gt,
    )

In [32]:
# Person-specific evaluation on the collected dataset: one classifier per
# participant, scored with the Classifier wrapper's cross-validation mode.
f1_scores = []
precision_scores = []
test_groups = []
for participant_id, data in person_specific_dataset.items():
    print(f"---- Participant id {participant_id} ----")
    # Binarise the labels: 0 when the first ground-truth entry is below 2, otherwise 1.
    person_specific_gt = np.array([0 if value[0] < 2 else 1 for value in person_specific_ground_truth[participant_id]])
    if len(set(person_specific_gt)) < 2:
        # Only one class is present for this participant, so there is nothing to learn.
        continue
    person_specific_data, person_specific_gt = dataset_loader.divide_person_specific_data_into_intervals(data, person_specific_gt, num_samples = 60)
    preprocessed_data, preprocessed_ground_truth, _ = preprocess_data(person_specific_data, person_specific_gt)

    total_classes = len(preprocessed_ground_truth)
    class_percentage = np.array([num_items / total_classes for num_items in Counter(preprocessed_ground_truth).values()])
    if len(class_percentage) < 2:
        # preprocess_data drops short intervals; if that removed a whole class
        # (or everything) the balance ratio and the classifier are meaningless.
        print("Skipped: fewer than two classes remain after preprocessing")
        continue
    balanced_degree_score = 1.0 * class_percentage.min() / class_percentage.max()
    print(class_percentage, balanced_degree_score)
    # Informational only: the class-weighted variant
    # (Classifier(..., balanced_weight = 'balanced')) is currently disabled, so
    # balanced and imbalanced participants are trained the same way.
    print("Imbalanced" if balanced_degree_score < 0.5 else "Balanced")

    person_specific_statistic_features = extract_features(preprocessed_data)
    clf = Classifier(person_specific_statistic_features, preprocessed_ground_truth, cross_validation = True, feature_labels = feature_labels)
    # Alternatives to swap in (keep the CSV name below in sync):
    # clf.logistic_regression(), clf.random_forest_classifier(),
    # clf.support_vector_machine(), clf.multilayer_perceptron()
    f1score, precision = clf.knn_classifier()

    # Record the participant only once scores exist, so the three lists stay aligned.
    test_groups.append(participant_id)
    f1_scores.append(f1score)
    precision_scores.append(precision)
    print("---------------------------")
print("------------------------------------")
if test_groups:
    # `clf` is the wrapper from the last evaluated participant; it is only used to call dump_csv.
    clf.dump_csv(test_groups, f1_scores, 'DCU-NVT-EXP1-Personal-KNN.csv')
    mean_f1_score = np.mean(f1_scores)
    print(f"Mean F1-score: {mean_f1_score}")
    mean_precision_score = np.mean(precision_scores)
    print(f"Mean precision score: {mean_precision_score}")
else:
    print("No participant could be evaluated; nothing to report.")

---- Participant id A ----
[0.61666667 0.38333333] 0.6216216216216216
Balanced
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best cv score: 0.7993265993265993
{'n_neighbors': 7, 'weights': 'distance'}
F1 Score: 0.5882352941176471
Precision Score: 0.625
Recall Score: 0.5555555555555556
---------------------------
---- Participant id B ----
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.0s finished
[0.69090909 0.30909091] 0.4473684210526316
Imbalanced
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best cv score: 0.6746031746031745
{'n_neighbors': 5, 'weights': 'uniform'}
F1 Score: 0.588235294117647
Precision Score: 0.5
Recall Score: 0.7142857142857143
---------------------------
---- Participant id C ----
---- Participant id D ----
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.0s finishe

# WESAD GSR Data Low-Sample Simulated

In [70]:
# Load the WESAD GSR dataset. This rebinds `dataset_loader` and `ground_truth`,
# replacing the values loaded for the collected dataset earlier in the notebook.
dataset_loader = DatasetLoader()
wesad_gsr_data, ground_truth = dataset_loader.load_wesad_gsr_dataset()

In [71]:
# Select signal index 0, then resample it from 700 Hz down to 5 Hz by
# interpolation. `single_signal_data` is rebound here and no longer refers to
# the collected dataset.
single_signal_data = select_single_signal(wesad_gsr_data, 0)
downsampling_wesad_gsr_data = resampling_data_signal(single_signal_data, sampling_rate = 700, desired_sampling_rate = 5, method = 'interpolation')

In [72]:
# Split the downsampled recordings into intervals. No sampling_rate is passed
# here (the collected-data cell passes sampling_rate = 5).
# NOTE(review): confirm the helper's default matches the 5 Hz resampled signal.
interval_gsr_data, interval_ground_truth, interval_group = dataset_loader.divide_into_intervals(downsampling_wesad_gsr_data, ground_truth, 60)

In [73]:
# Flatten the interval containers (data, labels, group ids) in that order and
# report how many distinct groups and samples result.
flatten_agg_data, flatten_agg_ground_truth, flatten_agg_group = (
    dataset_loader.flatten(nested)
    for nested in (interval_gsr_data, interval_ground_truth, interval_group)
)
print(f"Number of subjects: {len(set(flatten_agg_group))}")
print(f"Number of samples: {len(flatten_agg_data)}")

Number of subjects: 15
Number of samples: 75


In [74]:
# Filter out too-short intervals and fill missing values, then compute the
# statistical feature matrix (extract_features runs with its default sampling_rate of 5).
preprocessed_data, preprocessed_ground_truth, preprocessed_groups = preprocess_data(flatten_agg_data, flatten_agg_ground_truth, flatten_agg_group=flatten_agg_group)
statistic_features = extract_features(preprocessed_data)

## General Cross-population Classifier

### Binary classification

In [75]:
# The WESAD labels are used as-is, only flattened to 1-D (no thresholding as in
# the collected-data section). NOTE(review): assumes they are already binary -- confirm.
binary_ground_truth = preprocessed_ground_truth.flatten()
X = statistic_features
y = binary_ground_truth

In [76]:
# Build the cross-subject classifier wrapper with per-sample group ids.
# Every training call and the CSV export below are commented out, so as written
# this cell only constructs the wrapper.
clf = Classifier(X, y, groups = preprocessed_groups, logo_validation = True, feature_labels = feature_labels)
# test_groups, f1_scores = clf.logistic_regression()
# test_groups, f1_scores = clf.random_forest_classifier()
# test_groups, f1_scores = clf.support_vector_machine()
# test_groups, f1_scores = clf.multilayer_perceptron()
# test_groups, f1_scores = clf.knn_classifier()
# clf.dump_csv(test_groups, f1_scores, 'WESAD-General-RF.csv')

## Person-specific stress detection model

In [37]:
# Build per-participant containers for the person-specific stress detection
# models: participant id -> data, and participant id -> ground truth.
person_specific_dataset = defaultdict(dict)
person_specific_ground_truth = defaultdict(dict)
for participant_id, data in downsampling_wesad_gsr_data.items():
    person_specific_data, person_specific_gt = dataset_loader.prepare_person_specific_dataset(data, ground_truth)
    person_specific_dataset[participant_id], person_specific_ground_truth[participant_id] = (
        person_specific_data,
        person_specific_gt,
    )

In [38]:
# Person-specific evaluation on the downsampled WESAD data: one classifier per
# participant, scored with the Classifier wrapper's cross-validation mode.
f1_scores = []
precision_scores = []
test_groups = []
for participant_id, data in person_specific_dataset.items():
    print(f"---- Participant id {participant_id} ----")
    person_specific_gt = person_specific_ground_truth[participant_id].flatten()
    if len(set(person_specific_gt)) < 2:
        # Only one class is present for this participant, so there is nothing to learn.
        continue
    person_specific_data, person_specific_gt = dataset_loader.divide_person_specific_data_into_intervals(data, person_specific_gt, num_samples=60)
    preprocessed_data, preprocessed_ground_truth, _ = preprocess_data(person_specific_data, person_specific_gt)

    total_classes = len(preprocessed_ground_truth)
    class_percentage = np.array([num_items / total_classes for num_items in Counter(preprocessed_ground_truth).values()])
    if len(class_percentage) < 2:
        # preprocess_data drops short intervals; if that removed a whole class
        # (or everything) the balance ratio and the classifier are meaningless.
        print("Skipped: fewer than two classes remain after preprocessing")
        continue
    balanced_degree_score = 1.0 * class_percentage.min() / class_percentage.max()
    print(class_percentage, balanced_degree_score)

    person_specific_statistic_features = extract_features(preprocessed_data)
    # Informational only: the class-weighted variant
    # (Classifier(..., balanced_weight = 'balanced')) is currently disabled, so
    # balanced and imbalanced participants are trained the same way.
    print("Imbalanced" if balanced_degree_score < 0.5 else "Balanced")
    clf = Classifier(person_specific_statistic_features, preprocessed_ground_truth, cross_validation = True, feature_labels = feature_labels)
    # Alternatives to swap in (keep the CSV name below in sync):
    # clf.logistic_regression(), clf.decision_tree(), clf.support_vector_machine(),
    # clf.multilayer_perceptron(), clf.knn_classifier(), clf.extra_tree_classifier(),
    # clf.lda_classifier(), clf.logistic_regression_recursive_feature_selection()
    f1score, precision = clf.random_forest_classifier()

    # Record the participant only once scores exist, so the three lists stay aligned.
    test_groups.append(participant_id)
    f1_scores.append(f1score)
    precision_scores.append(precision)
    print("---------------------------")
print("------------------------------------")
if test_groups:
    # `clf` is the wrapper from the last evaluated participant; it is only used to call dump_csv.
    # NOTE(review): the person-specific cell of the "WESAD GSR Data" section
    # writes to this same file name, so whichever runs last overwrites the other.
    clf.dump_csv(test_groups, f1_scores, 'WESAD-Personal-RF.csv')
    mean_f1_score = np.mean(f1_scores)
    print(f"Mean F1-score: {mean_f1_score}")
    mean_precision_score = np.mean(precision_scores)
    print(f"Mean precision score: {mean_precision_score}")
else:
    print("No participant could be evaluated; nothing to report.")

---- Participant id S10 ----
[0.75510204 0.24489796] 0.32432432432432434
Imbalanced
Fitting 3 folds for each of 144 candidates, totalling 432 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:   37.5s finished
Best cv score: 0.9333333333333332
{'class_weight': None, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
F1 Score: 0.888888888888889
Precision Score: 1.0
Recall Score: 0.8
---------------------------
---- Participant id S11 ----
[0.77083333 0.22916667] 0.29729729729729726
Imbalanced
Fitting 3 folds for each of 144 candidates, totalling 432 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   13.9

# WESAD GSR Data

In [92]:
# Load the WESAD GSR dataset again for the non-downsampled experiment. This
# rebinds `dataset_loader` and `ground_truth` from the previous section.
dataset_loader = DatasetLoader()
wesad_gsr_data, ground_truth = dataset_loader.load_wesad_gsr_dataset()

In [93]:
# Reduce the WESAD data to a single signal (index 0).
# NOTE(review): the meaning of the index is defined by select_single_signal -- confirm.
single_signal_data = select_single_signal(wesad_gsr_data, 0)

In [94]:
# Split the recordings into intervals at sampling_rate = 4; the commented-out
# line is the 700 Hz variant of the same call.
# NOTE(review): confirm 4 Hz matches the signal selected above.
# interval_gsr_data, interval_ground_truth, interval_group = dataset_loader.divide_into_intervals(single_signal_data, ground_truth, 60, sampling_rate = 700)
interval_gsr_data, interval_ground_truth, interval_group = dataset_loader.divide_into_intervals(single_signal_data, ground_truth, 60, sampling_rate = 4)

### One-minute interval split

In [95]:
# Flatten the interval containers (data, labels, group ids) in that order and
# report how many distinct groups and samples result.
flatten_agg_data, flatten_agg_ground_truth, flatten_agg_group = (
    dataset_loader.flatten(nested)
    for nested in (interval_gsr_data, interval_ground_truth, interval_group)
)
print(f"Number of subjects: {len(set(flatten_agg_group))}")
print(f"Number of samples: {len(flatten_agg_data)}")

Number of subjects: 15
Number of samples: 788


In [96]:
# Active path: filter/fill the intervals with preprocess_data, then extract
# features at sampling_rate = 4. The three commented-out lines are the 700 Hz
# variant, which skipped preprocess_data and used the flattened arrays directly.
# preprocessed_ground_truth = flatten_agg_ground_truth
# preprocessed_groups = flatten_agg_group
# statistic_features = extract_features(flatten_agg_data, sampling_rate = 700)
preprocessed_data, preprocessed_ground_truth, preprocessed_groups = preprocess_data(flatten_agg_data, flatten_agg_ground_truth, flatten_agg_group=flatten_agg_group)
statistic_features = extract_features(preprocessed_data, sampling_rate = 4)

## General Cross-population Classifier

### Binary Classification

In [97]:
# The WESAD labels are used as-is, only flattened to 1-D (no thresholding as in
# the collected-data section). NOTE(review): assumes they are already binary -- confirm.
binary_ground_truth = preprocessed_ground_truth.flatten()
X = statistic_features
y = binary_ground_truth

In [98]:
# Build the cross-subject classifier wrapper with per-sample group ids.
# Every training call below is commented out, so as written this cell only
# constructs the wrapper and reports the class distribution of y.
clf = Classifier(X, y, groups = preprocessed_groups, logo_validation = True, feature_labels = feature_labels)
# test_groups, f1_scores = clf.logistic_regression()
# test_groups, f1_scores = clf.random_forest_classifier()
# test_groups, f1_scores = clf.support_vector_machine()
# test_groups, f1_scores = clf.multilayer_perceptron()
# test_groups, f1_scores = clf.knn_classifier()
dataset_loader.class_percentage_analysis(y)
# clf.dump_csv(test_groups, f1_scores, 'WESAD-General-RF.csv')

0: 571
1: 165


## Person-specific stress detection model

In [49]:
# Build per-participant containers for the person-specific stress detection
# models: participant id -> data, and participant id -> ground truth.
person_specific_dataset = defaultdict(dict)
person_specific_ground_truth = defaultdict(dict)
for participant_id, data in single_signal_data.items():
    person_specific_data, person_specific_gt = dataset_loader.prepare_person_specific_dataset(data, ground_truth)
    person_specific_dataset[participant_id], person_specific_ground_truth[participant_id] = (
        person_specific_data,
        person_specific_gt,
    )

In [50]:
# Person-specific evaluation on the WESAD data at sampling_rate = 4: one
# classifier per participant, scored with the Classifier wrapper's
# cross-validation mode.
f1_scores = []
precision_scores = []
test_groups = []
for participant_id, data in person_specific_dataset.items():
    print(f"---- Participant id {participant_id} ----")
    person_specific_gt = person_specific_ground_truth[participant_id].flatten()
    if len(set(person_specific_gt)) < 2:
        # Only one class is present for this participant, so there is nothing to learn.
        continue
    # A previous 700 Hz variant called the two helpers below with
    # sampling_rate = 700, skipped preprocess_data and used person_specific_gt
    # directly as the labels.
    person_specific_data, person_specific_gt = dataset_loader.divide_person_specific_data_into_intervals(data, person_specific_gt, num_samples=60, sampling_rate = 4)

    # WRIST-WORN DEVICE ONLY
    preprocessed_data, preprocessed_ground_truth, _ = preprocess_data(person_specific_data, person_specific_gt)
    total_classes = len(preprocessed_ground_truth)
    class_percentage = np.array([num_items / total_classes for num_items in Counter(preprocessed_ground_truth).values()])
    if len(class_percentage) < 2:
        # preprocess_data drops short intervals; if that removed a whole class
        # (or everything) the balance ratio and the classifier are meaningless.
        print("Skipped: fewer than two classes remain after preprocessing")
        continue
    person_specific_statistic_features = extract_features(preprocessed_data, sampling_rate = 4)

    balanced_degree_score = 1.0 * class_percentage.min() / class_percentage.max()
    print(class_percentage, balanced_degree_score)
    # Informational only: the class-weighted variant
    # (Classifier(..., balanced_weight = 'balanced')) is currently disabled, so
    # balanced and imbalanced participants are trained the same way.
    print("Imbalanced" if balanced_degree_score < 0.5 else "Balanced")
    clf = Classifier(person_specific_statistic_features, preprocessed_ground_truth, cross_validation = True, feature_labels = feature_labels)
    # Alternatives to swap in (keep the CSV name below in sync):
    # clf.logistic_regression(), clf.support_vector_machine(),
    # clf.multilayer_perceptron(), clf.knn_classifier()
    f1score, precision = clf.random_forest_classifier()

    # Record the participant only once scores exist, so the three lists stay aligned.
    test_groups.append(participant_id)
    f1_scores.append(f1score)
    precision_scores.append(precision)
    print("---------------------------")
print("------------------------------------")
if test_groups:
    # `clf` is the wrapper from the last evaluated participant; it is only used to call dump_csv.
    # NOTE(review): the person-specific cell of the "WESAD GSR Data Low-Sample
    # Simulated" section writes to this same file name, so whichever runs last
    # overwrites the other.
    clf.dump_csv(test_groups, f1_scores, 'WESAD-Personal-RF.csv')
    mean_f1_score = np.mean(f1_scores)
    print(f"Mean F1-score: {mean_f1_score}")
    mean_precision_score = np.mean(precision_scores)
    print(f"Mean precision score: {mean_precision_score}")
else:
    print("No participant could be evaluated; nothing to report.")

---- Participant id S10 ----
[0.76923077 0.23076923] 0.3
Imbalanced
Fitting 3 folds for each of 144 candidates, totalling 432 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:   33.4s finished
Best cv score: 1.0
{'class_weight': None, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
F1 Score: 0.8000000000000002
Precision Score: 0.8
Recall Score: 0.8
---------------------------
---- Participant id S11 ----
[0.7755102 0.2244898] 0.2894736842105263
Imbalanced
Fitting 3 folds for each of 144 candidates, totalling 432 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 432