In [1]:
import numpy as np
import os
import os.path as osp
from DatasetLoader.dataset_loader import DatasetLoader
from DatasetLoader.signal_processing import *
from sklearn.impute import SimpleImputer
from Classifiers.classifier_strategy import TryMLClassifierStrategy
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from collections import defaultdict
from typing import Tuple

In [2]:
# Instantiate the project loader and fetch the collected GSR dataset. The
# second return value is used throughout the notebook as the ground truth.
dataset_loader = DatasetLoader()
collected_gsr_data, ground_truth = dataset_loader.load_collected_gsr_dataset()

In [3]:
# Reduce the collected data to one signal using a project helper.
# NOTE(review): the selector `1` is presumably a signal/column index -
# confirm against the helper's definition (it is not shown in this notebook).
single_signal_data = select_single_signal(collected_gsr_data, 1)

In [4]:
# Disabled alternative: aggregate the dataset to seconds before splitting.
# agg_gsr_data = dataset_loader.aggregate_gsr_dataset(collected_gsr_data, ['MICROSIEMENS', 'SCR', 'SCR/MIN'])
# Disabled alternative: divide the aggregated data into intervals.
# agg_interval_gsr_data, agg_interval_ground_truth, agg_interval_group = dataset_loader.divide_into_intervals(agg_gsr_data, ground_truth, 60, sampling_rate = 5)
# Active path: divide the single-signal data into intervals. The three results
# (data, ground truth, group) are flattened and consumed by the cells below.
interval_gsr_data, interval_ground_truth, interval_group = dataset_loader.divide_into_intervals(single_signal_data, ground_truth, 60, sampling_rate = 5)

## General cross-population stress detection model

### One-minute interval split

In [5]:
# Flatten the interval-level containers so data, labels and group ids can be
# indexed sample by sample.
flatten_agg_data = dataset_loader.flatten(interval_gsr_data)
flatten_agg_ground_truth = dataset_loader.flatten(interval_ground_truth)
flatten_agg_group = dataset_loader.flatten(interval_group)

# Sanity check: distinct group ids (subjects) and total number of samples.
print(f"Number of subjects: {len(set(flatten_agg_group))}")
print(f"Number of samples: {len(flatten_agg_data)}")

Number of subjects: 11
Number of samples: 640


In [6]:
def preprocess_data(flatten_agg_data: np.ndarray, flatten_agg_ground_truth: np.ndarray, flatten_agg_group: np.ndarray = None, sampling_rate: int = 5, min_seconds: int = 50) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Drop too-short segments and zero-fill missing readings.

    Args:
        flatten_agg_data: Array whose elements are 1-D signal segments; each
            element must support ``len`` and ``reshape``.
        flatten_agg_ground_truth: Labels aligned element-wise with
            ``flatten_agg_data``.
        flatten_agg_group: Optional group identifiers aligned element-wise
            with the data. ``None`` is passed through unchanged.
        sampling_rate: Number of samples per second in each segment.
        min_seconds: Minimum duration (in seconds) a segment must cover to be
            kept. With the defaults the threshold is 50 * 5 = 250 samples.

    Returns:
        A tuple ``(imputed_data, ground_truth, group)`` restricted to the
        segments that passed the length filter, with missing values in the
        data replaced by 0.
    """
    # Minimum number of samples a segment needs; derived from the sampling
    # rate so the filter stays meaningful if the rate changes.
    min_samples = min_seconds * sampling_rate

    # Keep only the segments that are long enough, and filter the labels
    # (and groups, when given) with the same indices so they stay aligned.
    kept_index = [item_index for item_index, microsiemens in enumerate(flatten_agg_data) if len(microsiemens) >= min_samples]
    flatten_agg_data = flatten_agg_data[kept_index]
    flatten_agg_ground_truth = flatten_agg_ground_truth[kept_index]
    if flatten_agg_group is not None:
        flatten_agg_group = flatten_agg_group[kept_index]

    # Fill the missing values by constant = 0 (due to device error during data collection process)
    imp_constant = SimpleImputer(strategy='constant', fill_value=0)
    # NOTE(review): segments that pass the filter may still differ in length;
    # np.array would then build a ragged array, which recent NumPy versions
    # reject - confirm that intervals are uniform in length.
    imputed_agg_data = np.array([imp_constant.fit_transform(microsiemens.reshape(-1, 1)).flatten() for microsiemens in flatten_agg_data])

    # # Normalize the features before processing
    # scaler = MinMaxScaler(feature_range=(-1, 1))
    # scaled_agg_data = np.array([scaler.fit_transform(microsiemens.reshape(-1, 1)).flatten() for microsiemens in imputed_agg_data])
    # return scaled_agg_data, flatten_agg_ground_truth, flatten_agg_group
    return imputed_agg_data, flatten_agg_ground_truth, flatten_agg_group



In [7]:
def extract_features(preprocessed_data: np.array, sampling_rate: int = 5) -> np.array:
    """Turn preprocessed signal segments into one statistic-feature row each.

    Every segment is first passed through ``extract_gsr_features`` (with the
    given ``sampling_rate``); each result is then summarised by
    ``statistics_gsr_signal_features``. The summaries are stacked with
    ``np.array`` and returned.
    """
    # Stage 1: per-segment GSR feature extraction.
    gsr_features = []
    for segment in preprocessed_data:
        gsr_features.append(extract_gsr_features(segment, sampling_rate=sampling_rate))

    # Stage 2: statistic summary of each segment's extracted features.
    segment_statistics = []
    for features in gsr_features:
        segment_statistics.append(statistics_gsr_signal_features(features))
    return np.array(segment_statistics)

In [8]:
# Cross-population pipeline: filter/impute the flattened samples (keeping the
# group ids aligned), then compute one statistic-feature row per sample.
preprocessed_data, preprocessed_ground_truth, preprocessed_groups = preprocess_data(flatten_agg_data, flatten_agg_ground_truth, flatten_agg_group=flatten_agg_group)
statistic_features = extract_features(preprocessed_data)

### Binary Classification

In [9]:
# Binarize the first ground-truth component: values below 2 become 0,
# everything else becomes 1.
binary_ground_truth = np.array([0 if value[0] < 2 else 1 for value in preprocessed_ground_truth])

In [10]:
# Feature matrix and binarized labels for the cross-population experiment.
X = statistic_features
y = binary_ground_truth

In [11]:
# Evaluate the project's classifier strategies on (X, y), passing the
# per-sample group ids and requesting group-based validation.
# NOTE(review): the split logic lives in TryMLClassifierStrategy - confirm
# that no group appears in both train and test folds.
try_ml_strategies = TryMLClassifierStrategy(X, y, groups=preprocessed_groups)
try_ml_strategies.try_different_strategies(group_validation = True)

Try Logistic Regression...
Groups: [['G', 'D'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['B', 'E', 'J']]
ROC AUC Score: [0.9203778677462887, 0.19182467372568335, 0.7678132678132679, 0.5425]
Mean accuracy of train set: 0.8227092401101919
Mean accuracy of test set: 0.7519100736363125
--------------------------------------
Try Random Forests...
Groups: [['G', 'D'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['B', 'E', 'J']]
ROC AUC Score: [0.7977395411605938, 0.7240827382418124, 0.6851965601965602, 0.6637500000000001]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.7609079184638988
5 features remain
------After ensemble feature selection------
Groups: [['G', 'D'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['B', 'E', 'J']]
ROC AUC Score: [0.5737179487179488, 0.868505294262497, 0.42199017199017197, 0.6323333333333334]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.7243981531683494
--------------------------------------
Try Extra Trees Classifier...
Groups: [['G', 'D'], ['I', 'A', 'K'], 

### Assumed binary ground-truth

In [12]:
# Use the last component of each ground-truth entry as the label.
# NOTE(review): presumably this is the pre-assigned ("assumed") binary
# stress label - confirm against the dataset loader.
assumed_binary_ground_truth = np.array([value[-1] for value in preprocessed_ground_truth])

In [15]:
# Same feature matrix as before, now paired with the assumed binary labels.
X = statistic_features
y = assumed_binary_ground_truth

In [16]:
# Re-run the classifier strategies with group-based validation on the
# assumed binary labels.
try_ml_strategies = TryMLClassifierStrategy(X, y, groups=preprocessed_groups)
try_ml_strategies.try_different_strategies(group_validation = True)

Try Logistic Regression...
Groups: [['G', 'D'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['B', 'E', 'J']]
ROC AUC Score: [0.45781249999999996, 0.5075757575757576, 0.5492805755395683, 0.3786666666666666]
Mean accuracy of train set: 0.8408084034111594
Mean accuracy of test set: 0.8363735421335453
--------------------------------------
Try Random Forests...
Groups: [['G', 'D'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['B', 'E', 'J']]
ROC AUC Score: [0.2877604166666667, 0.4813131313131313, 0.5262589928057554, 0.4355555555555556]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.8211442595002288
20 features remain
------After ensemble feature selection------
Groups: [['G', 'D'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['B', 'E', 'J']]
ROC AUC Score: [0.2989583333333333, 0.40820707070707074, 0.5145683453237411, 0.45333333333333337]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.8124776726547627
--------------------------------------
Try Extra Trees Classifier...
Groups: [['G', 'D'], ['

### Detailed level-of-stress Classification

In [12]:
# Keep the first ground-truth component unchanged as a multi-level label
# (no binarization).
multi_label_ground_truth = np.array([value[0] for value in preprocessed_ground_truth])

In [13]:
# Same feature matrix, paired with the multi-level labels.
X = statistic_features
y = multi_label_ground_truth

In [14]:
# Multi-class run: multiclass=True is passed to the strategy object, and
# validation is again group-based.
try_ml_strategies = TryMLClassifierStrategy(X, y, groups=preprocessed_groups, multiclass=True)
try_ml_strategies.try_different_strategies(group_validation = True)

Try Logistic Regression...
Groups: [['D', 'G'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['E', 'J', 'B']]
Mean accuracy of train set: 0.5094459995733494
Mean accuracy of test set: 0.35382913343293415
--------------------------------------
Try Random Forests...
Groups: [['D', 'G'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['E', 'J', 'B']]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.3626373652577974
10 features remain
------After ensemble feature selection------
Groups: [['D', 'G'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['E', 'J', 'B']]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.36903044099456217
--------------------------------------
Try Extra Trees Classifier...
Groups: [['D', 'G'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['E', 'J', 'B']]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.3585786078178591
8 features remain
------After ensemble feature selection------
Groups: [['D', 'G'], ['I', 'A', 'K'], ['H', 'C', 'F'], ['E', 'J', 'B']]
Mean accuracy of train set:

## Person-specific stress detection model

### Binary classification

In [15]:
# Prepare personal-specific stress detection model
person_specific_dataset = defaultdict(dict)
person_specific_ground_truth = defaultdict(dict)
for participant_id, data in single_signal_data.items():
    person_specific_data, person_specific_gt = dataset_loader.prepare_person_specific_dataset(data, ground_truth)
    person_specific_dataset[participant_id] = person_specific_data
    person_specific_ground_truth[participant_id] = person_specific_gt

In [16]:
# Train and evaluate one binary model per participant.
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    # Binarize the first ground-truth component: below 2 -> 0, otherwise 1.
    person_specific_gt = np.array([0 if value[0] < 2 else 1 for value in person_specific_ground_truth[participant_id]])
    # Skip participants whose labels contain fewer than two distinct classes:
    # there is nothing to discriminate between.
    if len(set(person_specific_gt)) < 2:
        continue
    person_specific_data, person_specific_gt = dataset_loader.divide_person_specific_data_into_intervals(data, person_specific_gt, num_samples=60)
    # Participant-scoped names keep the cross-population `preprocessed_data` /
    # `preprocessed_ground_truth` arrays intact, so the cells that read them
    # still give the same result when re-run. Slicing (instead of unpacking
    # into `_`) avoids shadowing IPython's last-output variable.
    person_preprocessed_data, person_preprocessed_gt = preprocess_data(person_specific_data, person_specific_gt)[:2]
    person_specific_statistic_features = extract_features(person_preprocessed_data)
    try_ml_strategies = TryMLClassifierStrategy(person_specific_statistic_features, person_preprocessed_gt)
    try_ml_strategies.try_different_strategies(cross_validation=True)
    print("*****************************************")

Processing A
Try Logistic Regression...
ROC AUC Score: [0.7142857142857142, 0.6770833333333334, 0.5312499999999999]
Mean accuracy of train set: 0.8583333333333334
Mean accuracy of test set: 0.5833333333333334
1 features remain
------After recursive feature elimination------
ROC AUC Score: [0.989010989010989, 0.96875, 0.8072916666666667]
Mean accuracy of train set: 0.8083333333333332
Mean accuracy of test set: 0.8000000000000002
--------------------------------------
Try Random Forests...
ROC AUC Score: [0.8791208791208791, 0.90625, 0.734375]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.7333333333333334
7 features remain
------After ensemble feature selection------
ROC AUC Score: [0.956043956043956, 0.9270833333333334, 0.7291666666666667]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.8333333333333334
--------------------------------------
Try Extra Trees Classifier...
ROC AUC Score: [0.8406593406593407, 0.8645833333333334, 0.6302083333333334]
Mean accuracy

### Assumed binary ground-truth

In [None]:
# Train and evaluate one model per participant on the assumed binary labels.
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    # The last component of each ground-truth entry is used as the label.
    person_specific_gt = np.array([value[-1] for value in person_specific_ground_truth[participant_id]])
    # Skip participants whose labels contain fewer than two distinct classes:
    # there is nothing to discriminate between.
    if len(set(person_specific_gt)) < 2:
        continue
    person_specific_data, person_specific_gt = dataset_loader.divide_person_specific_data_into_intervals(data, person_specific_gt, num_samples=60)
    # Participant-scoped names keep the cross-population `preprocessed_data` /
    # `preprocessed_ground_truth` arrays intact, so the cells that read them
    # still give the same result when re-run. Slicing (instead of unpacking
    # into `_`) avoids shadowing IPython's last-output variable.
    person_preprocessed_data, person_preprocessed_gt = preprocess_data(person_specific_data, person_specific_gt)[:2]
    person_specific_statistic_features = extract_features(person_preprocessed_data)
    try_ml_strategies = TryMLClassifierStrategy(person_specific_statistic_features, person_preprocessed_gt)
    try_ml_strategies.try_different_strategies(cross_validation=True)
    print("*****************************************")

In [17]:
# Report the binary class balance for every participant.
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    # Binarize the first ground-truth component: below 2 -> 0, otherwise 1.
    binary_labels = [
        0 if value[0] < 2 else 1
        for value in person_specific_ground_truth[participant_id]
    ]
    person_specific_gt = np.array(binary_labels)
    dataset_loader.class_percentage_analysis(person_specific_gt)

Processing A
0: 0.6363636363636364
1: 0.36363636363636365
Processing B
0: 0.7272727272727273
1: 0.2727272727272727
Processing C
0: 1.0
Processing D
0: 0.45454545454545453
1: 0.5454545454545454
Processing E
0: 0.5454545454545454
1: 0.45454545454545453
Processing F
0: 0.8181818181818182
1: 0.18181818181818182
Processing G
0: 1.0
Processing H
0: 1.0
Processing I
0: 0.8181818181818182
1: 0.18181818181818182
Processing J
0: 0.5454545454545454
1: 0.45454545454545453
Processing K
0: 1.0


### Detailed level-of-stress Classification

In [19]:
# Train and evaluate one multi-class model per participant.
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    # The first ground-truth component is used unchanged as the class label.
    person_specific_gt = np.array([value[0] for value in person_specific_ground_truth[participant_id]])
    # Skip participants whose labels contain fewer than two distinct classes:
    # there is nothing to discriminate between.
    if len(set(person_specific_gt)) < 2:
        continue
    person_specific_data, person_specific_gt = dataset_loader.divide_person_specific_data_into_intervals(data, person_specific_gt, num_samples=60)
    # Participant-scoped names keep the cross-population `preprocessed_data` /
    # `preprocessed_ground_truth` arrays intact, so the cells that read them
    # still give the same result when re-run. Slicing (instead of unpacking
    # into `_`) avoids shadowing IPython's last-output variable.
    person_preprocessed_data, person_preprocessed_gt = preprocess_data(person_specific_data, person_specific_gt)[:2]
    person_specific_statistic_features = extract_features(person_preprocessed_data)
    try_ml_strategies = TryMLClassifierStrategy(person_specific_statistic_features, person_preprocessed_gt, multiclass=True)
    try_ml_strategies.try_different_strategies(cross_validation=True)
    print("*****************************************")

Processing A
Try Logistic Regression...
Mean accuracy of train set: 0.7333333333333334
Mean accuracy of test set: 0.4666666666666666
2 features remain
------After recursive feature elimination------
Mean accuracy of train set: 0.6583333333333333
Mean accuracy of test set: 0.6833333333333332
--------------------------------------
Try Random Forests...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.65
10 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.65
--------------------------------------
Try Extra Trees Classifier...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.5499999999999999
10 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.65
--------------------------------------
Try SVM Classifier...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.5
--------------------------------------
Try MLPClassifier

In [19]:
# Report the multi-level class balance for every participant.
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    # The first ground-truth component is used unchanged as the class label.
    stress_levels = [
        value[0]
        for value in person_specific_ground_truth[participant_id]
    ]
    person_specific_gt = np.array(stress_levels)
    dataset_loader.class_percentage_analysis(person_specific_gt)

Processing A
0: 0.45454545454545453
1: 0.18181818181818182
2: 0.36363636363636365
Processing B
0: 0.45454545454545453
1: 0.2727272727272727
2: 0.18181818181818182
3: 0.09090909090909091
Processing C
0: 0.6
1: 0.4
Processing D
0: 0.18181818181818182
1: 0.2727272727272727
2: 0.45454545454545453
3: 0.09090909090909091
Processing E
0: 0.2727272727272727
1: 0.2727272727272727
3: 0.2727272727272727
2: 0.18181818181818182
Processing F
0: 0.45454545454545453
1: 0.36363636363636365
3: 0.09090909090909091
2: 0.09090909090909091
Processing G
0: 0.36363636363636365
1: 0.6363636363636364
Processing H
0: 0.5
1: 0.5
Processing I
0: 0.2727272727272727
1: 0.5454545454545454
2: 0.18181818181818182
Processing J
0: 0.45454545454545453
1: 0.09090909090909091
2: 0.45454545454545453
Processing K
0: 0.9090909090909091
1: 0.09090909090909091
