In [1]:
# Add additional library
import sys, os, os.path as osp
data_lib = os.path.abspath('../data')
if data_lib not in sys.path:
    sys.path.append(data_lib)

from data_utils import *
import pickle
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json
import pandas as pd
from typing import List, Dict
from collections import defaultdict

In [2]:
BVP_SAMPLING_RATE = 700
EDA_SAMPLING_RATE = 700
TEMP_SAMPLING_RATE = 700

dp_manager = get_data_path_manager()

In [3]:
def resampling_indices(indices, target_sr, original_sr):
    rate = target_sr / original_sr
    resampled_indices = [int(round(ind) * rate) for ind in indices]
    return resampled_indices

In [4]:
def assert_resampling_indices(fst_inds, fnd_inds, fsr, sst_inds, snd_inds, ssr):
    assert(len(fst_inds) == len(sst_inds))
    assert(len(fnd_inds) == len(fnd_inds))

    num_indices = len(fst_inds)
    for i in range(num_indices):
        f_len = round((fnd_inds[i] - fst_inds[i]) / fsr)
        s_len = round((snd_inds[i] - sst_inds[i]) / ssr)
        assert(f_len == s_len)

In [5]:
# Get dataset path
wesad_dataset_path = dp_manager.WESAD_dataset_path

# List of user ids
user_ids = sorted([user_id for user_id in os.listdir(wesad_dataset_path) if osp.splitext(user_id)[-1] != '.csv'])

# Objective ground-truth file path
ground_truth_path = osp.join(wesad_dataset_path, 'WESAD-Ground-Truth.csv')

In [6]:
ground_truth = pd.read_csv(ground_truth_path)
ground_truth.head()

Unnamed: 0,INSTANCE,LABEL
0,S10_baseline_1.csv,0
1,S10_amusement_2.csv,0
2,S10_meditation_3.csv,0
3,S10_stress_4.csv,1
4,S10_meditation_5.csv,0


---

# WESAD CURSOR-INDICES PREPROCESSING

In [7]:
WESAD_LABELS = ['', 'baseline', 'stress', 'amusement', 'meditation']
ORIGINAL_SAMPLING_RATE = 700
metadata_file_path = str(Path(dp_manager.dataset_path) / 'wesad_chest_metadata.json')

wesad_metadata = {}
for user_id in tqdm(user_ids):
    data_path = str(Path(dp_manager.WESAD_dataset_path) / user_id / f'{user_id}.pkl')
    data = pickle.load(open(data_path, 'rb'), encoding = 'bytes')

    labels = data[b'label'].tolist()
    labels.insert(-1, 0)
    num_labels = len(labels)
    starting_indices = []
    ending_indices = []
    gt = []
    for i in range(1, num_labels):
        if labels[i] != labels[i-1]:
            if 0 < labels[i] < 5: 
                starting_indices.append(i)
                gt.append(WESAD_LABELS[labels[i]])
            if 0 < labels[i-1] < 5: ending_indices.append(i-1)
    bvp_starting_indices = resampling_indices(starting_indices, BVP_SAMPLING_RATE, ORIGINAL_SAMPLING_RATE)
    bvp_ending_indices = resampling_indices(ending_indices, BVP_SAMPLING_RATE, ORIGINAL_SAMPLING_RATE)
    assert_resampling_indices(starting_indices, ending_indices, ORIGINAL_SAMPLING_RATE, bvp_starting_indices, bvp_ending_indices, BVP_SAMPLING_RATE) 

    eda_starting_indices = resampling_indices(starting_indices, EDA_SAMPLING_RATE, ORIGINAL_SAMPLING_RATE)
    eda_ending_indices = resampling_indices(ending_indices, EDA_SAMPLING_RATE, ORIGINAL_SAMPLING_RATE)
    assert_resampling_indices(starting_indices, ending_indices, ORIGINAL_SAMPLING_RATE, eda_starting_indices, eda_ending_indices, EDA_SAMPLING_RATE)

    wesad_metadata[user_id] = {
        'labels': gt,
        'bvp': {
            'starting_indices': bvp_starting_indices,
            'ending_indices': bvp_ending_indices
        },
        'eda': {
            'starting_indices': eda_starting_indices,
            'ending_indices': eda_ending_indices
        },
        'temp': {
            'starting_indices': eda_starting_indices, # TEMP sampling rate is the same as EDA sampling rate
            'ending_indices': eda_ending_indices
        }
    }

with open(metadata_file_path, 'w') as f:
    json.dump(wesad_metadata, f)

100%|██████████| 15/15 [00:41<00:00,  2.78s/it]


In [8]:
data[b'signal'][b'wrist'].keys()

dict_keys([b'ACC', b'BVP', b'EDA', b'TEMP'])

In [9]:
data[b'signal'][b'chest'].keys()

dict_keys([b'ACC', b'ECG', b'EMG', b'EDA', b'Temp', b'Resp'])

## Create eda and bvp signal data

In [10]:
eda = defaultdict(dict)
bvp = defaultdict(dict)
temp = defaultdict(dict)
device = 'chest'
for user_id in tqdm(user_ids):
    temp_raw_signal = load_raw_signal(dp_manager, 'WESAD', user_id, device, 'Temp')
    eda_raw_signal = load_raw_signal(dp_manager, 'WESAD', user_id, device, 'EDA')
    bvp_raw_signal = load_raw_signal(dp_manager, 'WESAD', user_id, device, 'ECG')
    user_metadata = wesad_metadata[user_id]
    for i, label in enumerate(user_metadata['labels']):
        if label == 'meditation': continue
        # Get task id for ground-truth label
        task_id = f'{user_id}_{label}_{i+1}'

        # Get raw bvp signal of the task
        starting_index = user_metadata['bvp']['starting_indices'][i]
        ending_index = user_metadata['bvp']['ending_indices'][i]
        trim_indices = (starting_index, ending_index)
        trimmed_signal = get_trimmed_signal(bvp_raw_signal, trim_indices, lag = 0, sampling_rate = BVP_SAMPLING_RATE)
        bvp[user_id][task_id] = trimmed_signal

        # Get raw eda signal of the task
        starting_index = user_metadata['eda']['starting_indices'][i]
        ending_index = user_metadata['eda']['ending_indices'][i]
        trim_indices = (starting_index, ending_index)
        trimmed_signal = get_trimmed_signal(eda_raw_signal, trim_indices, lag = 0, sampling_rate = EDA_SAMPLING_RATE)
        eda[user_id][task_id] = trimmed_signal

        # Get raw skin temperature signal of the task
        trimmed_signal = get_trimmed_signal(temp_raw_signal, trim_indices, lag = 0, sampling_rate = TEMP_SAMPLING_RATE)
        temp[user_id][task_id] = trimmed_signal

100%|██████████| 15/15 [01:38<00:00,  6.58s/it]


## Map ground-truth and save data

In [11]:
def dump_dataset_pickle(eda: Dict[str, Dict[str, List[float]]], bvp: Dict[str, Dict[str, List[float]]], temp: Dict[str, Dict[str, List[float]]], ground_truth: Dict[str, Dict[str, List[int]]], file_path: str):
    data = { 'eda': eda, 'bvp': bvp, 'temp': temp, 'ground_truth': ground_truth }
    pickle.dump(data, open(file_path, 'wb'))


In [23]:
def map_data_groundtruth(dataset: Dict[str, Dict[str, List[float]]], ground_truth: pd.DataFrame) -> Dict[str, Dict[str, List[int]]]:
    ground_truth = ground_truth.set_index('INSTANCE') # Set index of the ground-truth file to task_id for retrieval
    gt = defaultdict(dict)
    for user_id, data in tqdm(dataset.items()):
        for task_id, eda_signal in data.items():
            _task_id = task_id + '.csv'
            task_ground_truth = ground_truth.loc[_task_id].values # Get task ground-truth
            len_eda_signal = len(eda_signal)
            # gt[user_id][task_id] = task_ground_truth.tolist() * len_eda_signal # Duplicate ground-truth to label each eda signal
            gt[user_id][task_id] = task_ground_truth[0] # Duplicate ground-truth to label each eda signal
    return gt

In [24]:
def data_assertion(dataset: Dict[str, Dict[str, List[float]]], ground_truth: Dict[str, Dict[str, List[int]]]):
    for user_id, data in tqdm(dataset.items()):
        for task_id, eda_signal in data.items():
            _task_id = task_id + '.csv'
            len_eda_signal = len(eda_signal)
            len_gt = len(ground_truth[user_id][task_id])
            # Assert the length of the ground-truth == the length of eda signal
            if len_eda_signal != len_gt:
                print(user_id, task_id, 'Length not equal')
                print(len_eda_signal, len_gt)
            # Assert if the signal has missing values?
            if any(elem is None for elem in eda_signal):
                print(user_id, task_id, 'Has None value')

In [25]:
print("Map ground-truth to each data signal")
gt = map_data_groundtruth(eda, ground_truth)

Map ground-truth to each data signal


100%|██████████| 15/15 [00:00<00:00, 4037.13it/s]


In [26]:
# Assert that the data is correct
data_assertion(eda, gt)

  0%|          | 0/15 [00:00<?, ?it/s]


TypeError: object of type 'numpy.int64' has no len()

In [27]:
# Save dataset
output_file_path = osp.join(dp_manager.dataset_path, 'WESAD_chest.pkl')
dump_dataset_pickle(eda, bvp, temp, gt, output_file_path)