# Imports

In [1]:
import json
import src.configs as cfg
import src.preprocessing as pp
import src.feature_extraction as fe
import pandas as pd
from tqdm import tqdm
import neurokit2 as nk
import h5py

# Format JSON Input to Single DataFrame

In [2]:
# load json example
ecg_batch = json.loads("""{
                "supervisor": "Lieschen Mueller",
                "record_date": "2034-01-16",
                "configs": {
                    "device_name": "bioplux",
                    "frequency": 360,
                    "signal": "chest",
                    "window_slicing_method": "time_related",
                    "window_size": 5.0
                },
                "samples": [
                    {
                      "sample_id": "f70c1033-36ae-4b8b-8b89-099a96dccca5",
                      "subject_id": "participant_1",
                      "timestamp_idx": [1679709871, 1679713471, 1679720671, 1679724071, 1679727471, 1679730871, 1679734271, 1679737671, 1679741071, 1679744471, 1679747871, 1679751271, 1679754671],
                      "ecg": [1.0, -1.100878, -3.996840, 0.5, -2.345, 1.234, -0.987, 2.345, -1.234, 0.5, 1.0, 1.0, 1.0],
                      "label": ["undefined", "stress", "undefined", "undefined", "calm", "undefined", "stress", "calm", "undefined", "stress", "undefined", "undefined", "undefined"]
                    },
                    {
                      "sample_id": "sample_id_2",
                      "subject_id": "participant_2",
                      "timestamp_idx": [1679709871, 1679713471, 1679720671, 1679724071, 1679727471, 1679730871, 1679734271, 1679737671, 1679741071, 1679744471, 1679747871, 1679751271, 1679754671],
                      "ecg": [1.2, -1.567, -3.456, 0.7, -2.789, 1.876, -0.345, 2.567, -1.876, 0.7, 1.0, 1.0, 1.0],
                      "label": ["undefined", "stress", "undefined", "calm", "undefined", "stress", "calm", "undefined", "stress", "undefined", "undefined", "undefined", "undefined"]
                    },
                    {
                      "sample_id": "sample_id_3",
                      "subject_id": "participant_3",
                      "timestamp_idx": [1679709871, 1679713471, 1679720671, 1679724071, 1679727471, 1679730871, 1679734271, 1679737671, 1679741071, 1679744471, 1679747871, 1679751271, 1679754671],
                      "ecg": [0.8, -1.234, -3.789, 0.9, -2.567, 1.456, -0.234, 2.789, -1.456, 0.9, 1.0, 1.0, 1.0],
                      "label": ["undefined", "calm", "stress", "undefined", "stress", "undefined", "calm", "undefined", "stress", "undefined", "undefined", "undefined", "undefined"]
                    }
                  ]
            }""")

In [3]:
# to pandas
ecg_batch_df = pd.json_normalize(ecg_batch, record_path=['samples'], meta=['supervisor', 'record_date', ['configs', 'device_name'], ['configs', 'frequency'], ['configs', 'signal'], ['configs', 'window_slicing_method'], ['configs', 'window_size']])
# explode every list colmn to rows
ecg_batch_df = ecg_batch_df.explode(['timestamp_idx', 'ecg', 'label'])

In [4]:
ecg_batch_df

Unnamed: 0,sample_id,subject_id,timestamp_idx,ecg,label,supervisor,record_date,configs.device_name,configs.frequency,configs.signal,configs.window_slicing_method,configs.window_size
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679709871,1.0,undefined,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679713471,-1.100878,stress,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679720671,-3.99684,undefined,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679724071,0.5,undefined,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679727471,-2.345,calm,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679730871,1.234,undefined,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679734271,-0.987,stress,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679737671,2.345,calm,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679741071,-1.234,undefined,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0
0,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1679744471,0.5,stress,Lieschen Mueller,2034-01-16,bioplux,360,chest,time_related,5.0


# Open the HDF5 file of self acquired data

In [5]:
# Open the HDF5 file
file = h5py.File('../data/opensignals_0007804c285f_2023-05-23_22-52-57.h5', 'r')
# Inspection of .h5 file internal structure/groups (in this case the mac address list of the used devices for acquiring data)
list(file.keys())

['00:07:80:4C:28:5F']

In [6]:
# Access to the second hierarchy level through group key "00:07:80:3B:46:61"
h5_group = file.get('00:07:80:4C:28:5F')
print ("Second hierarchy level: " + str(list(h5_group)))
# Identification of h5_group metadata attributes
print("Metadata of h5_group: \n" + str(list(h5_group.attrs.keys())))

Second hierarchy level: ['digital', 'events', 'plugin', 'raw', 'support']
Metadata of h5_group: 
['channels', 'comments', 'date', 'device', 'device connection', 'device name', 'digital IO', 'duration', 'firmware version', 'keywords', 'macaddress', 'mode', 'nsamples', 'resolution', 'sampling rate', 'sync interval', 'time']


In [7]:
# Access to the third level of data through group key "00:07:80:3B:46:61" and sub-group key "raw"
h5_sub_group = h5_group.get("raw")
print("Third hierarchy level: " + str(list(h5_sub_group)))

Third hierarchy level: ['channel_1', 'channel_2', 'nSeq']


In [8]:
%%time
# Extract the timestamp-related details
sampling_rate = h5_group.attrs['sampling rate']
date = h5_group.attrs['date']
time = h5_group.attrs['time']

# Transposition of "channel_1" and "channel_2" dataset to Python lists. These sub-groups contain the raw data of the ECG and BVP channels respectively.
ecg = [float(x) for x in h5_sub_group.get("channel_1")]
bvp = [float(x) for x in h5_sub_group.get("channel_2")]

# Create the timestamp column
num_samples = len(ecg)
timestamps = pd.date_range(start=date + ' ' + time, periods=num_samples, freq=f'{1/sampling_rate}S')

# Map the timestamp column to the data
data_with_timestamp = pd.DataFrame({'timestamp_idx': timestamps,
                                    'ecg': ecg,
                                    'bvp': bvp,
                                    'label': ['undefined'] * num_samples,
                                    'sample_id': 'sample_id_1',
                                    'subject_id': 'self_acquired'})
# Cut of the first 10 minutes of the data
data_with_timestamp = data_with_timestamp[data_with_timestamp['timestamp_idx'] >= data_with_timestamp['timestamp_idx'][0] + pd.Timedelta(minutes=10)]

# Close the file
file.close()

CPU times: total: 9.44 s
Wall time: 11.2 s


In [9]:
data_with_timestamp

Unnamed: 0,timestamp_idx,ecg,bvp,label,sample_id,subject_id
600000,2023-05-23 23:03:01.335,4047.0,35744.0,undefined,sample_id_1,self_acquired
600001,2023-05-23 23:03:01.336,3952.0,35592.0,undefined,sample_id_1,self_acquired
600002,2023-05-23 23:03:01.337,4342.0,35350.0,undefined,sample_id_1,self_acquired
600003,2023-05-23 23:03:01.338,3586.0,35064.0,undefined,sample_id_1,self_acquired
600004,2023-05-23 23:03:01.339,3059.0,34760.0,undefined,sample_id_1,self_acquired
...,...,...,...,...,...,...
2806345,2023-05-23 23:39:47.680,526.0,21264.0,undefined,sample_id_1,self_acquired
2806346,2023-05-23 23:39:47.681,458.0,21304.0,undefined,sample_id_1,self_acquired
2806347,2023-05-23 23:39:47.682,2258.0,21122.0,undefined,sample_id_1,self_acquired
2806348,2023-05-23 23:39:47.683,5684.0,21192.0,undefined,sample_id_1,self_acquired


## Window Slicing and Processing

In [10]:
def create_windows(df, time_column, window_size=5.0, window_slicing_method='time_related'):
    """
    Slices a dataframe into windows of a given size. The windows can be sliced in different ways. The windows are returned as a generator of dataframes. The dataframe must have a column containing timestamps and be indexed by it.

    :param df: The dataframe to slice.
    :type df: pandas.DataFrame
    :param time_column: The name of the column containing the timestamps.
    :type time_column: str
    :param window_size: The size of the windows in seconds.
    :type window_size: int
    :param window_slicing_method: The method used to slice the windows.
    :type window_slicing_method: str

    :return: A generator of dataframes containing the windows.
    :rtype: generator
    """
    # Convert the timestamp column to datetime if it's not already
    if not pd.api.types.is_datetime64_ns_dtype(df[time_column]):
        df[time_column] = pd.to_datetime(df[time_column])

    # Slice the dataframe into windows
    if window_slicing_method == 'time_related':
        # Resample the dataframe every x seconds
        result_dfs = [group for _, group in df.groupby(pd.Grouper(key=time_column, freq=f'{window_size}S'))]
        return result_dfs
    elif window_slicing_method == 'label_related_before':
        pass
    elif window_slicing_method == 'label_related_after':
        pass
    elif window_slicing_method == 'label_related_centered':
        pass
    else:
        raise ValueError(f'window_slicing_method {window_slicing_method} not supported')

In [11]:
%%time
# test window slicing on self sample
df = data_with_timestamp.copy()
# preprocess ecg
df['ecg'] = nk.ecg_clean(data_with_timestamp['ecg'], sampling_rate=sampling_rate, method="pantompkins1985")
# slice in windows (window_size and window_slicing_method)
windows = create_windows(df, time_column='timestamp_idx', window_size=10.0)
print(f'Number of windows: {len(list(windows))}')
# compute ecg features vor each window
features_df = pd.DataFrame()
for i, window in enumerate(windows):
    # compute features
    features = fe.hrv_features(window['ecg'].values, sampling_rate)
    tmp = pd.DataFrame(features, index=[0])
    tmp['sample_id'] = df['sample_id'].unique()
    tmp['subject_id'] = df['subject_id'].unique()
    tmp['window_id'] = i
    tmp['w_start_time'] = window['timestamp_idx'].min()
    tmp['W_end_time'] = window['timestamp_idx'].max()
    # combine features
    features_df = pd.concat([features_df, tmp], axis=0)
    features_df.reset_index(drop=True, inplace=True)

Number of windows: 221


  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio 

CPU times: total: 18.3 s
Wall time: 32.4 s


  lf_hf_ratio = lf_band / hf_band


In [19]:
features_df

Unnamed: 0,mean_rr,sdnn,rmssd,nn50,pnn50,lf_band,hf_band,lf_hf_ratio,sample_id,subject_id,window_id,w_start_time,W_end_time
0,0.405810,0.198763,0.287096,19,0.904762,0.0,0.0,,sample_id_1,self_acquired,0,2023-05-23 23:03:01.335,2023-05-23 23:03:09.999
1,0.453048,0.230953,0.345653,18,0.857143,0.0,0.0,,sample_id_1,self_acquired,1,2023-05-23 23:03:10.000,2023-05-23 23:03:19.999
2,0.421174,0.198847,0.228372,19,0.826087,0.0,0.0,,sample_id_1,self_acquired,2,2023-05-23 23:03:20.000,2023-05-23 23:03:29.999
3,0.425455,0.167115,0.262425,20,0.909091,0.0,0.0,,sample_id_1,self_acquired,3,2023-05-23 23:03:30.000,2023-05-23 23:03:39.999
4,0.373692,0.154035,0.229737,23,0.884615,0.0,0.0,,sample_id_1,self_acquired,4,2023-05-23 23:03:40.000,2023-05-23 23:03:49.999
...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,0.610200,0.377412,0.464918,12,0.800000,0.0,0.0,,sample_id_1,self_acquired,216,2023-05-23 23:39:00.000,2023-05-23 23:39:09.999
217,0.410739,0.195539,0.305365,20,0.869565,0.0,0.0,,sample_id_1,self_acquired,217,2023-05-23 23:39:10.000,2023-05-23 23:39:19.999
218,0.512882,0.263666,0.347918,12,0.705882,0.0,0.0,,sample_id_1,self_acquired,218,2023-05-23 23:39:20.000,2023-05-23 23:39:29.999
219,1.055444,0.058585,0.069406,6,0.666667,0.0,0.0,,sample_id_1,self_acquired,219,2023-05-23 23:39:30.000,2023-05-23 23:39:39.999


In [15]:
features_df.to_json('../data/features.json', orient='records')