In [1]:
import numpy as np
import os
import os.path as osp
from DatasetLoader.dataset_loader import DatasetLoader
from DatasetLoader.signal_processing import *
from sklearn.impute import SimpleImputer
from Classifiers.classifier_strategy import TryMLClassifierStrategy
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from collections import defaultdict
from typing import Tuple

SyntaxError: invalid syntax (dataset_loader.py, line 21)

In [2]:
# Create the project loader and read the collected GSR dataset; the call returns
# two objects, bound here as the recorded data and its ground truth.
dataset_loader = DatasetLoader()
collected_gsr_data, ground_truth = dataset_loader.load_collected_gsr_dataset()

In [3]:
# Reduce the collected data to one signal. The result is later iterated with
# .items() as (participant_id, data) pairs.
# NOTE(review): the second argument (1) selects the signal; its meaning is defined
# in DatasetLoader.signal_processing - confirm which signal index 1 denotes.
single_signal_data = select_single_signal(collected_gsr_data, 1)

In [4]:
# Disabled alternative pipeline, kept for reference:
# # Aggregate dataset to seconds
# agg_gsr_data = dataset_loader.aggregate_gsr_dataset(collected_gsr_data, ['MICROSIEMENS', 'SCR', 'SCR/MIN'])
# Divide the data into intervals
# agg_interval_gsr_data, agg_interval_ground_truth, agg_interval_group = dataset_loader.divide_into_intervals(agg_gsr_data, ground_truth, 60, sampling_rate = 5)
# Active pipeline: split the single-signal data into intervals and get the matching
# ground truth and group identifiers back.
# NOTE(review): 60 is presumably the interval length in seconds (one-minute split)
# and sampling_rate = 5 the sampling frequency in Hz - confirm in DatasetLoader.
interval_gsr_data, interval_ground_truth, interval_group = dataset_loader.divide_into_intervals(single_signal_data, ground_truth, 60, sampling_rate = 5)

## General cross-population stress detection model

### One-minute interval split

In [5]:
# Flatten the per-interval containers (data, ground truth, group ids) with the loader.
flatten_agg_data = dataset_loader.flatten(interval_gsr_data)
flatten_agg_ground_truth = dataset_loader.flatten(interval_ground_truth)
flatten_agg_group = dataset_loader.flatten(interval_group)

# Report the number of distinct group identifiers and the number of samples.
num_subjects = len(set(flatten_agg_group))
num_samples = len(flatten_agg_data)
print(f"Number of subjects: {num_subjects}")
print(f"Number of samples: {num_samples}")

Number of subjects: 11
Number of samples: 640


In [20]:
def preprocess_data(flatten_agg_data: np.ndarray, flatten_agg_ground_truth: np.ndarray, flatten_agg_group: np.ndarray = None, sampling_rate: int = 5, min_samples: int = 250) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Drop segments that are too short and replace missing readings with 0.

    Args:
        flatten_agg_data: Sequence of 1-D signal segments (array or list).
        flatten_agg_ground_truth: Labels aligned index-by-index with the segments.
        flatten_agg_group: Optional group identifiers aligned with the segments;
            pass None when there is no grouping.
        sampling_rate: Currently unused; kept so existing callers keep working.
        min_samples: Minimum number of values a segment must contain to be kept.
            The default of 250 is the threshold the notebook has always used
            (NOTE(review): presumably 50 seconds at 5 Hz - confirm).

    Returns:
        A tuple (data, ground_truth, group). `data` is a 2-D float array when all
        kept segments have the same length, otherwise a 1-D object array holding
        one float array per segment. `group` is None when no groups were given.
    """
    # Positions of the segments that are long enough to keep.
    keep_index = np.asarray(
        [position for position, segment in enumerate(flatten_agg_data) if len(segment) >= min_samples],
        dtype=int,
    )

    # np.asarray lets plain lists be indexed the same way as arrays.
    kept_ground_truth = np.asarray(flatten_agg_ground_truth)[keep_index]
    kept_group = None
    if flatten_agg_group is not None:
        kept_group = np.asarray(flatten_agg_group)[keep_index]

    # Fill the missing values by constant = 0 (due to device error during data
    # collection process). A NaN mask gives the same result for every segment,
    # including segments that consist only of missing values.
    imputed_segments = []
    for position in keep_index:
        values = np.asarray(flatten_agg_data[position], dtype=float).ravel()
        imputed_segments.append(np.where(np.isnan(values), 0.0, values))

    if len({len(values) for values in imputed_segments}) <= 1:
        # Uniform lengths (or nothing kept): a regular array, as before.
        imputed_agg_data = np.array(imputed_segments)
    else:
        # Kept segments may differ in length; np.array() on such a list raises
        # on recent NumPy, so build the object array explicitly.
        imputed_agg_data = np.empty(len(imputed_segments), dtype=object)
        for slot, values in enumerate(imputed_segments):
            imputed_agg_data[slot] = values

    return imputed_agg_data, kept_ground_truth, kept_group



In [7]:
def extract_features(preprocessed_data: np.array, sampling_rate: int = 5) -> np.array:
    """Turn each preprocessed signal into a row of statistic features.

    Every signal is first passed through `extract_gsr_features` (with the given
    `sampling_rate`); each result is then summarised by
    `statistics_gsr_signal_features`. The rows are stacked into one array.
    """
    # First pass: per-signal GSR feature extraction.
    gsr_feature_sets = []
    for signal in preprocessed_data:
        gsr_feature_sets.append(extract_gsr_features(signal, sampling_rate=sampling_rate))

    # Second pass: statistic features computed from each extracted feature set.
    feature_rows = []
    for feature_set in gsr_feature_sets:
        feature_rows.append(statistics_gsr_signal_features(feature_set))

    return np.array(feature_rows)

In [8]:
# Filter and impute the flattened samples (keeping ground truth and group ids
# aligned), then compute the statistic feature matrix used by the classifiers below.
preprocessed_data, preprocessed_ground_truth, preprocessed_groups = preprocess_data(flatten_agg_data, flatten_agg_ground_truth, flatten_agg_group=flatten_agg_group)
statistic_features = extract_features(preprocessed_data)

### Binary Classification

In [9]:
# Binarise the labels: the first element of each ground-truth entry maps to 0 when
# it is below 2 and to 1 otherwise.
# NOTE(review): presumably levels 0-1 mean "not stressed" and 2+ "stressed" - confirm.
binary_ground_truth = np.array([0 if value[0] < 2 else 1 for value in preprocessed_ground_truth])

In [10]:
# Feature matrix and binary target for the cross-population model.
X = statistic_features
y = binary_ground_truth

In [11]:
# Run the candidate classifiers with group-based validation; the captured output
# lists the participant ids assigned to each of the three groups.
try_ml_strategies = TryMLClassifierStrategy(X, y, groups=preprocessed_groups)
try_ml_strategies.try_different_strategies(group_validation = True)

Try Logistic Regression...
Groups: [['E', 'C', 'D'], ['J', 'K', 'I', 'A'], ['H', 'B', 'G', 'F']]
ROC AUC Score: [0.4207828518173346, 0.44581618655692734, 0.46972049689441]
Mean accuracy of train set: 0.7709077253193818
Mean accuracy of test set: 0.7396081001547862
--------------------------------------
Try Random Forests...
Groups: [['E', 'C', 'D'], ['J', 'K', 'I', 'A'], ['H', 'B', 'G', 'F']]
ROC AUC Score: [0.3463031997514755, 0.43941472336534065, 0.33880046583850937]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.7110820628600019
25 features remain
------After ensemble feature selection------
Groups: [['E', 'C', 'D'], ['J', 'K', 'I', 'A'], ['H', 'B', 'G', 'F']]
ROC AUC Score: [0.3487107797452625, 0.43021262002743477, 0.3962538819875776]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.7051066687617243
--------------------------------------
Try Extra Trees Classifier...
Groups: [['E', 'C', 'D'], ['J', 'K', 'I', 'A'], ['H', 'B', 'G', 'F']]
ROC AUC Score: [0.38

### Detailed level-of-stress Classification

In [12]:
# Multi-class target: use the first element of each ground-truth entry unchanged
# (the class-balance output further down shows values 0-3).
multi_label_ground_truth = np.array([value[0] for value in preprocessed_ground_truth])

In [13]:
# Same feature matrix as the binary task, now paired with the multi-class target.
X = statistic_features
y = multi_label_ground_truth

In [14]:
# Repeat the group-validated classifier comparison in multi-class mode.
try_ml_strategies = TryMLClassifierStrategy(X, y, groups=preprocessed_groups, multiclass=True)
try_ml_strategies.try_different_strategies(group_validation = True)

Try Logistic Regression...
Groups: [['E', 'C', 'D'], ['J', 'K', 'I', 'A'], ['H', 'B', 'G', 'F']]
Mean accuracy of train set: 0.45637704908806537
Mean accuracy of test set: 0.3799621511280921
--------------------------------------
Try Random Forests...
Groups: [['E', 'C', 'D'], ['J', 'K', 'I', 'A'], ['H', 'B', 'G', 'F']]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.35998139947099794
26 features remain
------After ensemble feature selection------
Groups: [['E', 'C', 'D'], ['J', 'K', 'I', 'A'], ['H', 'B', 'G', 'F']]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.3774732363266471
--------------------------------------
Try Extra Trees Classifier...
Groups: [['E', 'C', 'D'], ['J', 'K', 'I', 'A'], ['H', 'B', 'G', 'F']]
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.3757604921731676
29 features remain
------After ensemble feature selection------
Groups: [['E', 'C', 'D'], ['J', 'K', 'I', 'A'], ['H', 'B', 'G', 'F']]
Mean accuracy of train set: 1.0
Mean

## Person-specific stress detection model

### Binary classification

In [15]:
# Prepare personal-specific stress detection model
person_specific_dataset = defaultdict(dict)
person_specific_ground_truth = defaultdict(dict)
for participant_id, data in single_signal_data.items():
    person_specific_data, person_specific_gt = dataset_loader.prepare_person_specific_dataset(data, ground_truth)
    person_specific_dataset[participant_id] = person_specific_data
    person_specific_ground_truth[participant_id] = person_specific_gt

In [16]:
# Train and evaluate the candidate binary classifiers separately for each participant.
# Loop-local names are used on purpose: the notebook-level `preprocessed_data` and
# `preprocessed_ground_truth` feed the cross-population cells and must not be rebound here.
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    # Same binarisation as the cross-population task: first label element < 2 -> 0, else 1.
    participant_labels = np.array([0 if value[0] < 2 else 1 for value in person_specific_ground_truth[participant_id]])
    # Skip participants whose labels contain a single class only.
    if len(set(participant_labels)) < 2:
        print(f"Skipping {participant_id}: only one class present")
        continue
    interval_data, interval_labels = dataset_loader.divide_person_specific_data_into_intervals(data, participant_labels, num_samples=60)
    participant_signals, participant_targets, _ = preprocess_data(interval_data, interval_labels)
    person_specific_statistic_features = extract_features(participant_signals)
    try_ml_strategies = TryMLClassifierStrategy(person_specific_statistic_features, participant_targets)
    try_ml_strategies.try_different_strategies(cross_validation = True)
    print("*****************************************")

Processing A
Try Logistic Regression...
Mean accuracy of train set: 0.7416666666666666
Mean accuracy of test set: 0.4833333333333334
11 features remain
------After recursive feature elimination------
Mean accuracy of train set: 0.7166666666666668
Mean accuracy of test set: 0.6
--------------------------------------
Try Random Forests...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.6166666666666667
15 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.6666666666666666
--------------------------------------
Try Extra Trees Classifier...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.6833333333333332
19 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.7333333333333334
--------------------------------------
Try SVM Classifier...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.6166666666666667
-------------

In [17]:
# Show the binary class balance of every participant.
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    binary_labels = [0 if value[0] < 2 else 1 for value in person_specific_ground_truth[participant_id]]
    person_specific_gt = np.array(binary_labels)
    dataset_loader.class_percentage_analysis(person_specific_gt)

Processing A
0: 0.6363636363636364
1: 0.36363636363636365
Processing B
0: 0.7272727272727273
1: 0.2727272727272727
Processing C
0: 1.0
Processing D
0: 0.45454545454545453
1: 0.5454545454545454
Processing E
0: 0.5454545454545454
1: 0.45454545454545453
Processing F
0: 0.8181818181818182
1: 0.18181818181818182
Processing G
0: 1.0
Processing H
0: 1.0
Processing I
0: 0.8181818181818182
1: 0.18181818181818182
Processing J
0: 0.5454545454545454
1: 0.45454545454545453
Processing K
0: 1.0


### Detailed level-of-stress Classification

In [18]:
# Train and evaluate the candidate classifiers per participant on the detailed stress levels.
# Loop-local names are used on purpose: the notebook-level `preprocessed_data` and
# `preprocessed_ground_truth` feed the cross-population cells and must not be rebound here.
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    # Use the first element of each ground-truth entry as the stress level.
    participant_labels = np.array([value[0] for value in person_specific_ground_truth[participant_id]])
    # Skip participants whose labels contain a single class only.
    if len(set(participant_labels)) < 2:
        print(f"Skipping {participant_id}: only one class present")
        continue
    interval_data, interval_labels = dataset_loader.divide_person_specific_data_into_intervals(data, participant_labels, num_samples=60)
    participant_signals, participant_targets, _ = preprocess_data(interval_data, interval_labels)
    person_specific_statistic_features = extract_features(participant_signals)
    try_ml_strategies = TryMLClassifierStrategy(person_specific_statistic_features, participant_targets)
    try_ml_strategies.try_different_strategies(cross_validation = True)
    print("*****************************************")

Processing A
Try Logistic Regression...
Mean accuracy of train set: 0.6833333333333332
Mean accuracy of test set: 0.3333333333333333
5 features remain
------After recursive feature elimination------
Mean accuracy of train set: 0.5666666666666668
Mean accuracy of test set: 0.4833333333333334
--------------------------------------
Try Random Forests...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.4333333333333333
18 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.5499999999999999
--------------------------------------
Try Extra Trees Classifier...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.4000000000000001
19 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.4333333333333333
--------------------------------------
Try SVM Classifier...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.4333333333333333

In [19]:
# Show the per-level class balance of every participant.
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    stress_levels = [value[0] for value in person_specific_ground_truth[participant_id]]
    person_specific_gt = np.array(stress_levels)
    dataset_loader.class_percentage_analysis(person_specific_gt)

Processing A
0: 0.45454545454545453
1: 0.18181818181818182
2: 0.36363636363636365
Processing B
0: 0.45454545454545453
1: 0.2727272727272727
2: 0.18181818181818182
3: 0.09090909090909091
Processing C
0: 0.6
1: 0.4
Processing D
0: 0.18181818181818182
1: 0.2727272727272727
2: 0.45454545454545453
3: 0.09090909090909091
Processing E
0: 0.2727272727272727
1: 0.2727272727272727
3: 0.2727272727272727
2: 0.18181818181818182
Processing F
0: 0.45454545454545453
1: 0.36363636363636365
3: 0.09090909090909091
2: 0.09090909090909091
Processing G
0: 0.36363636363636365
1: 0.6363636363636364
Processing H
0: 0.5
1: 0.5
Processing I
0: 0.2727272727272727
1: 0.5454545454545454
2: 0.18181818181818182
Processing J
0: 0.45454545454545453
1: 0.09090909090909091
2: 0.45454545454545453
Processing K
0: 0.9090909090909091
1: 0.09090909090909091
