In [1]:
import configparser
import os
import os.path as osp
import pandas as pd
from collections import defaultdict
import pickle

# Initialize data paths

In [2]:
# Get dataset path from config.ini located in the parent of the current working directory
config_path = osp.join(osp.abspath(os.pardir), 'config.ini')
parser = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of files
# it actually parsed, so fail fast here instead of hitting an opaque KeyError below
if not parser.read(config_path):
    raise FileNotFoundError(f'Cannot read configuration file: {config_path}')
affectiveROAD_dataset_path = parser['DATA_PATH']['affectiveROAD_dataset_path']

# EDA E4 Wearable devices data path (sampling-rate = 4 Hz)
e4_folder_path = osp.join(affectiveROAD_dataset_path, 'E4')
e4_annot_right_path = osp.join(e4_folder_path, 'Annot_E4_Right.csv')
# Get folder name of each users' data (every sub-directory of the E4 folder)
user_data_folder = [name for name in os.listdir(e4_folder_path) if osp.isdir(osp.join(e4_folder_path, name))]
# List of user ids: the part of each folder name after the last '-'
user_ids = [name.split('-')[-1] for name in user_data_folder]

# Subjective ground-truth path
subj_metric_path = osp.join(affectiveROAD_dataset_path, 'Subj_metric')
# Overall Annotation of subjective metric
anno_subj_metric_path = osp.join(subj_metric_path, 'Annot_Subjective_metric.csv')
# Map ground-truth file to each user corresponding to $user_ids (same order as $user_ids)
user_anno_gt_path = [f'SM_{user_id}.csv' for user_id in user_ids]

# Process dataset to a predefined format

In [3]:
# Read the semicolon-separated E4 (right wrist) annotation table and display it
e4_annot_right = pd.read_csv(e4_annot_right_path, delimiter=';')
e4_annot_right


Unnamed: 0,User_id,Rest_Start,Rest_End,Z_Start,Z_End,City1_Start,City1_End,Hwy_Start,Hwy_End,City2_Start,...,City2_Start.1,City2_End.1,Hwy_Start.1,Hwy_End.1,City1_Start.1,City1_End.1,Z_Start.1,Z_End.1,Rest_Start.1,Rest_End.1
0,NM1,688,4288,4580,5248,5296,6408,7088,8208,8728,...,11608,13564,13676,14904,15432,16408,16484,17136,17248,20848
1,RY1,10972,14204,14560,15016,15260,16348,16752,17748,18384,...,21196,23108,23200,24380,24868,25788,25904,26420,26532,28532
2,BK1,3728,7356,7352,8320,8412,9348,9864,11076,11420,...,14040,16240,16356,17476,18392,19252,19340,20296,20284,23936
3,MT1,2388,5988,6420,7164,7308,9080,9668,10900,11332,...,13756,15884,16100,17348,17812,19016,19084,19940,20024,23572
4,EK1,2656,6260,6548,7200,7472,9576,10284,11376,11884,...,14336,16156,16576,17560,18088,19628,19656,20296,21452,25216
5,RY2,2624,6224,7496,8096,8124,9544,9992,11056,11472,...,13928,16032,16164,17264,17672,18852,18888,19584,19868,23468
6,KSG1,3712,7312,7632,8352,8432,9432,9724,10880,11312,...,14520,16632,16744,17504,18304,19112,19320,19952,20512,24112
7,NM2,3988,7588,7768,8520,8588,10888,11328,12648,12992,...,15720,17508,17608,18968,19428,20448,20608,21248,21648,25248
8,AD1,3756,7356,7432,8132,8296,9272,9752,10816,11188,...,13432,15412,15520,16632,16952,17632,17752,18748,18972,22572
9,GM1,4328,8320,9316,9868,10060,11428,11928,13120,13464,...,16168,18208,18288,19496,20144,21088,21128,21688,22208,25980


In [4]:
# Read the semicolon-separated subjective-metric annotation table and display it
subj_metric_anno = pd.read_csv(anno_subj_metric_path, delimiter=';')
subj_metric_anno

Unnamed: 0,User_id,Z_Start,Z_End,City1_Start,City1_End,Hwy_Start,Hwy_End,City2_Start,City2_End,City2_Start.1,City2_End.1,Hwy_Start.1,Hwy_End.1,City1_Start.1,City1_End.1,Z_Start.1,Z_End.1
0,NM1,1,576,624,1736,2416,3536,4056,6628,6936,8892,9004,10232,10760,11736,11812,12464
1,RY1,1,252,496,1584,1988,2984,3620,6008,6432,8344,8436,9616,10104,11024,11140,11656
2,BK1,1,612,704,1640,2156,3368,3712,5872,6332,8532,8648,9768,10684,11544,11632,12588
3,MT1,1,544,688,2460,3048,4280,4712,6740,7136,9264,9480,10728,11192,12396,12464,13320
4,EK1,1,468,740,2844,3552,4644,5152,7284,7604,9424,9844,10828,11356,12896,12924,13564
5,RY2,1,140,168,1588,2036,3100,3516,5628,5972,8076,8208,9308,9716,10896,10932,11628
6,KSG1,1,316,396,1396,1688,2844,3276,5836,6484,8596,8708,9468,10268,11076,11284,11916
7,NM2,1,360,428,2728,3168,4488,4832,6944,7560,9348,9448,10808,11268,12288,12448,13088
8,AD1,1,448,612,1588,2068,3132,3504,5348,5748,7728,7836,8948,9268,9948,10068,11064
9,GM1,128,680,872,2240,2740,3932,4276,6580,6980,9020,9100,10308,10956,11900,11940,12500


In [5]:
# Format EDA signal into a predefined format: Dictionary[user_id][route]
eda = defaultdict(dict)
headers = e4_annot_right.columns
# Column 0 holds the user id; the remaining columns come in (start, end) pairs, one pair per route
num_routes = (len(headers) - 1) >> 1
# os.listdir() returns entries in arbitrary order, so a folder cannot be paired with an
# annotation row by position. Resolve it by user id instead, using the same id extraction
# as $user_ids (text after the last '-' in the folder name).
user_folder_by_id = {name.split('-')[-1]: name for name in user_data_folder}
for row in e4_annot_right.values:
    user_id = row[0]
    if user_id not in user_folder_by_id:
        raise KeyError(f'No E4 data folder found for user {user_id!r} in {e4_folder_path}')
    # Load e4 eda data of $user_id
    user_eda_path = osp.join(e4_folder_path, user_folder_by_id[user_id], 'EDA.csv')
    with open(user_eda_path, 'r') as eda_file:
        # Drop the first and second rows as they contain metadata of the signal
        eda_signal = [float(line.rstrip()) for line in eda_file.readlines()[2:]]
    for route_index in range(num_routes):
        # Position of this route's *_Start column; its *_End column is the next one
        start_col = (route_index << 1) + 1
        route_name = headers[start_col].split('_')[0]
        if route_name == 'Z':
            continue  # 'Z' segments are excluded from the output
        start_index, end_index = row[start_col:start_col + 2]
        if route_name in eda[user_id]:
            route_name += '_Return' # Second occurrence of a route name: return phase
        eda[user_id][route_name] = eda_signal[start_index:end_index]

In [6]:
# Format ground-truth into a predefined format: Dictionary[user_id][route]
ground_truth = defaultdict(dict)
headers = subj_metric_anno.columns
# Column 0 holds the user id; the remaining columns come in (start, end) pairs, one pair per route
num_routes = (len(headers) - 1) >> 1
# $user_ids comes from os.listdir(), whose order is arbitrary, so a ground-truth file cannot
# be paired with an annotation row by position. Resolve it by user id instead.
gt_file_by_id = dict(zip(user_ids, user_anno_gt_path))
for row in subj_metric_anno.values:
    user_id = row[0]
    if user_id not in gt_file_by_id:
        raise KeyError(f'No ground-truth file registered for user {user_id!r}')
    # Load ground-truth of $user_id
    user_gt_path = osp.join(subj_metric_path, gt_file_by_id[user_id])
    with open(user_gt_path, 'r') as gt_file:
        # Drop the first row as it contains metadata of the signal
        user_gt_anno = [float(line.rstrip()) for line in gt_file.readlines()[1:]]
    for route_index in range(num_routes):
        # Position of this route's *_Start column; its *_End column is the next one
        start_col = (route_index << 1) + 1
        route_name = headers[start_col].split('_')[0]
        if route_name == 'Z':
            continue  # 'Z' segments are excluded from the output
        start_index, end_index = row[start_col:start_col + 2]
        if route_name in ground_truth[user_id]:
            route_name += '_Return' # Second occurrence of a route name: return phase
        ground_truth[user_id][route_name] = user_gt_anno[start_index:end_index]
            

# The Rest segments get an all-zero ground-truth with the same length as the
# matching EDA segment (outbound phase first, then return phase)
for user_id in user_ids:
    for rest_phase in ('Rest', 'Rest_Return'):
        ground_truth[user_id][rest_phase] = [0] * len(eda[user_id][rest_phase])


# The ground-truth is a continuous value; binarise it into 0 / 1 labels using this cut-off.
# Siirtola P, Röning J. Comparison of Regression and Classification Models for User-Independent and Personal Stress Detection. Sensors (Basel). 2020 Aug 7;20(16):4402. doi: 10.3390/s20164402. PMID: 32784547; PMCID: PMC7472084.
THRESHOLD = 0.4
for user_id in user_ids:
    user_routes = ground_truth[user_id]
    # Only existing keys are reassigned, so iterating over items() while writing is safe
    for route_name, scores in user_routes.items():
        # Label is 0 only when the score is strictly below the threshold, otherwise 1
        user_routes[route_name] = [int(not score < THRESHOLD) for score in scores]


In [7]:
# Compare the length of the ground-truth and the eda signal for every user / route;
# mismatches are only printed (user and route, then both lengths), nothing is raised
for user_id in user_ids:
    for route_name, labels in ground_truth[user_id].items():
        eda_duration = len(eda[user_id][route_name])
        gt_duration = len(labels)
        if eda_duration == gt_duration:
            continue
        print(user_id, route_name)
        print(eda_duration, gt_duration)

In [9]:
# Bundle the per-user EDA segments and their labels and persist them next to the dataset
data = { 'eda': eda, 'ground_truth': ground_truth }
output_file_path = osp.join(affectiveROAD_dataset_path, 'affectiveROAD_dataset.pkl')
# Use a context manager so the file is flushed and closed as soon as the dump finishes
with open(output_file_path, 'wb') as output_file:
    pickle.dump(data, output_file)
