In [1]:
import numpy as np
import os
import os.path as osp
import neurokit2 as nk
from DatasetLoader.dataset_loader import DatasetLoader
from DatasetLoader.signal_processing import *
import configparser
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV, SelectFromModel
import matplotlib.pyplot as plt
from Classifiers.classifier_strategy import TryMLClassifierStrategy
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
from collections import defaultdict
from typing import Tuple

In [2]:
dataset_loader = DatasetLoader()
collected_gsr_data, ground_truth = dataset_loader.load_collected_gsr_dataset()

In [3]:
# Aggrgate dataset to seconds
agg_gsr_data = dataset_loader.aggregate_gsr_dataset(collected_gsr_data, ['MICROSIEMENS', 'SCR', 'SCR/MIN'])
# Divide the data into intervals
agg_interval_gsr_data, agg_interval_ground_truth, agg_interval_group = dataset_loader.divide_into_intervals(agg_gsr_data, ground_truth, 60)

In [4]:
agg_interval_gsr_data = select_single_signal(agg_interval_gsr_data, 0)

## General cross-population stress detection model

### Aggregated data + One-minute interval split

In [5]:
flatten_agg_data = dataset_loader.flatten(agg_interval_gsr_data)
flatten_agg_ground_truth = dataset_loader.flatten(agg_interval_ground_truth)
flatten_agg_group = dataset_loader.flatten(agg_interval_group)
print(f"Number of subjects: {len(list(set(flatten_agg_group)))}")
print(f"Number of samples: {len(flatten_agg_data)}")

Number of subjects: 11
Number of samples: 639


In [6]:
def preprocess_data(flatten_agg_data: np.array, flatten_agg_ground_truth: np.array, flatten_agg_group: np.array = None, sampling_rate: int = 5) -> Tuple[ np.array, np.array, np.array ]:
    # Remove the cut with less than 50 values
    filtered_agg_index = [item_index for item_index, microsiemens in enumerate(flatten_agg_data) if len(microsiemens) >= 50]
    flatten_agg_data = flatten_agg_data[filtered_agg_index]
    flatten_agg_ground_truth = flatten_agg_ground_truth[filtered_agg_index]
    if flatten_agg_group is not None:
        flatten_agg_group = flatten_agg_group[filtered_agg_index]

    # Fill the missing values by constant = 0 (due to device error during data collection process)
    imp_constant = SimpleImputer(strategy='constant', fill_value=0)
    imputed_agg_data = np.array([imp_constant.fit_transform(microsiemens.reshape(-1, 1)).flatten() for microsiemens in flatten_agg_data])

    # Normalize the features before processing
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_agg_data = np.array([scaler.fit_transform(microsiemens.reshape(-1, 1)).flatten() for microsiemens in imputed_agg_data])
    return scaled_agg_data, flatten_agg_ground_truth, flatten_agg_group


In [7]:
def extract_features(preprocessed_data: np.array, sampling_rate: int = 5) -> np.array:
    # Extract features from the data
    processed_features = [extract_gsr_features(microsiemens, sampling_rate=sampling_rate) for microsiemens in preprocessed_data]

    # Extract statistic features
    statistic_features = np.array([statistics_gsr_signal_features(feat) for feat in processed_features])
    return statistic_features

In [8]:
preprocessed_data, preprocessed_ground_truth, preprocessed_groups = preprocess_data(flatten_agg_data, flatten_agg_ground_truth, flatten_agg_group=flatten_agg_group)
statistic_features = extract_features(preprocessed_data)

### Binary Classification

In [9]:
binary_ground_truth = np.array([0 if value[0] < 2 else 1 for value in preprocessed_ground_truth])

In [10]:
X = statistic_features
y = binary_ground_truth

In [11]:
try_ml_strategies = TryMLClassifierStrategy(X, y, groups=preprocessed_groups)
try_ml_strategies.try_different_strategies(group_validation = True)

### Detailed level-of-stress Classification

In [12]:
multi_label_ground_truth = np.array([value[0] for value in preprocessed_ground_truth])

In [13]:
X = statistic_features
y = multi_label_ground_truth

In [14]:
try_ml_strategies = TryMLClassifierStrategy(X, y, groups=preprocessed_groups, multiclass=True)
try_ml_strategies.try_different_strategies(group_validation = True)

Try Logistic Regression...
Mean accuracy of train set: 0.46309590974025444
Mean accuracy of test set: 0.3959981296096439
--------------------------------------
Try Random Forests...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.36560979963253787
13 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.3889948478119741
--------------------------------------
Try Extra Trees Classifier...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.36843933360624215
13 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.3862223421367108
--------------------------------------
Try SVM Classifier...
Mean accuracy of train set: 0.46396284151821715
Mean accuracy of test set: 0.41823485872832755
--------------------------------------
Try MLPClassifier...
Mean accuracy of train set: 0.49270179906356226
Mean accuracy of test set: 0.39445371369657

## Person-specific stress detection model

### Binary classification

In [15]:
# Prepare personal-specific stress detection model
person_specific_dataset = defaultdict(dict)
person_specific_ground_truth = defaultdict(dict)
for participant_id, data in agg_gsr_data.items():
    person_specific_data, person_specific_gt = dataset_loader.prepare_person_specific_dataset(data, ground_truth)
    person_specific_dataset[participant_id] = person_specific_data
    person_specific_ground_truth[participant_id] = person_specific_gt

In [16]:
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    person_specific_gt = np.array([0 if value[0] < 2 else 1 for value in person_specific_ground_truth[participant_id]])
    if len(list(set(person_specific_gt))) < 2: # If the data contains non-stress value, then continue
        continue
    person_specific_data, person_specific_gt = dataset_loader.divide_person_specific_data_into_intervals(data, person_specific_gt, num_samples=60)
    preprocessed_data, preprocessed_ground_truth, _ = preprocess_data(person_specific_data, person_specific_gt)
    person_specific_statistic_features = extract_features(preprocessed_data)
    try_ml_strategies = TryMLClassifierStrategy(person_specific_statistic_features, preprocessed_ground_truth)
    try_ml_strategies.try_different_strategies(cross_validation = True)
    print("*****************************************")

Processing A
Try Logistic Regression...
Mean accuracy of train set: 0.75
Mean accuracy of test set: 0.7000000000000001
7 features remain
------After recursive feature elimination------
Mean accuracy of train set: 0.75
Mean accuracy of test set: 0.7000000000000001
--------------------------------------
Try Random Forests...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.6
12 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.5833333333333334
--------------------------------------
Try Extra Trees Classifier...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.6166666666666666
12 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.6166666666666667
--------------------------------------
Try SVM Classifier...
Mean accuracy of train set: 0.8333333333333334
Mean accuracy of test set: 0.6833333333333332
---------------------------

### Detailed level-of-stress Classification

In [18]:
for participant_id, data in person_specific_dataset.items():
    print(f"Processing {participant_id}")
    person_specific_gt = np.array([value[0] for value in person_specific_ground_truth[participant_id]])
    if len(list(set(person_specific_gt))) < 2: # If the data contains non-stress value, then continue
        continue
    person_specific_data, person_specific_gt = dataset_loader.divide_person_specific_data_into_intervals(data, person_specific_gt, num_samples=60)
    preprocessed_data, preprocessed_ground_truth, _ = preprocess_data(person_specific_data, person_specific_gt)
    person_specific_statistic_features = extract_features(preprocessed_data)
    try_ml_strategies = TryMLClassifierStrategy(person_specific_statistic_features, preprocessed_ground_truth)
    try_ml_strategies.try_different_strategies(cross_validation = True)
    print("*****************************************")

Processing A
Try Logistic Regression...
Mean accuracy of train set: 0.5750000000000001
Mean accuracy of test set: 0.5499999999999999
1 features remain
------After recursive feature elimination------
Mean accuracy of train set: 0.6
Mean accuracy of test set: 0.6166666666666667
--------------------------------------
Try Random Forests...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.5166666666666667
13 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.46666666666666673
--------------------------------------
Try Extra Trees Classifier...
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.43333333333333335
13 features remain
------After ensemble feature selection------
Mean accuracy of train set: 1.0
Mean accuracy of test set: 0.45
--------------------------------------
Try SVM Classifier...
Mean accuracy of train set: 0.7416666666666666
Mean accuracy of test set: 0.6
--------------------------