# Imports

In [3]:
import json
import src.configs as cfg
import src.preprocessing as pp
import src.feature_extraction as fe
import pandas as pd
from tqdm import tqdm
import neurokit2 as nk
import h5py

# HDF5 file -> Pandas DataFrame

In [4]:
# Open the HDF5 file
file = h5py.File('../data/opensignals_0007804c285f_2023-05-23_22-52-57.h5', 'r')
# Inspection of .h5 file internal structure/groups (in this case the mac address list of the used devices for acquiring data)
list(file.keys())

['00:07:80:4C:28:5F']

In [5]:
# Access to the second hierarchy level through group key "00:07:80:3B:46:61"
h5_group = file.get('00:07:80:4C:28:5F')
print ("Second hierarchy level: " + str(list(h5_group)))
# Identification of h5_group metadata attributes
print("Metadata of h5_group: \n" + str(list(h5_group.attrs.keys())))

Second hierarchy level: ['digital', 'events', 'plugin', 'raw', 'support']
Metadata of h5_group: 
['channels', 'comments', 'date', 'device', 'device connection', 'device name', 'digital IO', 'duration', 'firmware version', 'keywords', 'macaddress', 'mode', 'nsamples', 'resolution', 'sampling rate', 'sync interval', 'time']


In [6]:
# Access to the third level of data through group key "00:07:80:3B:46:61" and sub-group key "raw"
h5_sub_group = h5_group.get("raw")
print("Third hierarchy level: " + str(list(h5_sub_group)))

Third hierarchy level: ['channel_1', 'channel_2', 'nSeq']


### Add timestamp column

In [7]:
%%time
# Extract the timestamp-related details
sampling_rate = h5_group.attrs['sampling rate']
date = h5_group.attrs['date']
time = h5_group.attrs['time']

# Transposition of "channel_1" and "channel_2" dataset to Python lists. These sub-groups contain the raw data of the ECG and BVP channels respectively.
ecg = [float(x) for x in h5_sub_group.get("channel_1")]
bvp = [float(x) for x in h5_sub_group.get("channel_2")]

# Create the timestamp column
num_samples = len(ecg)
timestamps = pd.date_range(start=date + ' ' + time, periods=num_samples, freq=f'{1/sampling_rate}S')

# Map the timestamp column to the data and add other columns
data_with_timestamp = pd.DataFrame({'timestamp_idx': timestamps,
                                    'ecg': ecg,
                                    #'bvp': bvp
                                    })
# Cut of the first 10 minutes of the data
data_with_timestamp = data_with_timestamp[data_with_timestamp['timestamp_idx'] >= data_with_timestamp['timestamp_idx'][0] + pd.Timedelta(minutes=10)]
data_with_timestamp.reset_index(inplace=True, drop=True)
# convert to string so that it is json serializable
data_with_timestamp['timestamp_idx'] = data_with_timestamp['timestamp_idx'].astype(str)

# Close the file
file.close()

CPU times: total: 4.61 s
Wall time: 16.2 s


In [8]:
data_with_timestamp

Unnamed: 0,timestamp_idx,ecg
0,2023-05-23 23:03:01.335,4047.0
1,2023-05-23 23:03:01.336,3952.0
2,2023-05-23 23:03:01.337,4342.0
3,2023-05-23 23:03:01.338,3586.0
4,2023-05-23 23:03:01.339,3059.0
...,...,...
2206345,2023-05-23 23:39:47.680,526.0
2206346,2023-05-23 23:39:47.681,458.0
2206347,2023-05-23 23:39:47.682,2258.0
2206348,2023-05-23 23:39:47.683,5684.0


### Add other columns

In [9]:
df_selfacq = data_with_timestamp.copy()
num_samples = len(df_selfacq)

df_selfacq['label'] = ['undefined'] * num_samples
df_selfacq['sample_id'] = 'f70c1033-36ae-4b8b-8b89-099a96dccca5'
df_selfacq['subject_id'] = 'participant_1'

df_selfacq['supervisor'] = 'Lieschen Mueller'
df_selfacq['record_date'] = '2034-01-16'

df_selfacq['configs.device_name'] = 'bioplux'
df_selfacq['configs.frequency'] = 1000
df_selfacq['configs.signal'] = 'chest'
df_selfacq['configs.window_slicing_method'] = 'time_related'
df_selfacq['configs.window_size'] = 5.0

In [10]:
df_selfacq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2206350 entries, 0 to 2206349
Data columns (total 12 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   timestamp_idx                  object 
 1   ecg                            float64
 2   label                          object 
 3   sample_id                      object 
 4   subject_id                     object 
 5   supervisor                     object 
 6   record_date                    object 
 7   configs.device_name            object 
 8   configs.frequency              int64  
 9   configs.signal                 object 
 10  configs.window_slicing_method  object 
 11  configs.window_size            float64
dtypes: float64(2), int64(1), object(9)
memory usage: 202.0+ MB


# Pandas DataFrame <-> Pydantic Model JSON Input Format

In [None]:
# load json example
ecg_batch = json.loads("""{
                "supervisor": "Lieschen Mueller",
                "record_date": "2034-01-16",
                "configs": {
                    "device_name": "bioplux",
                    "frequency": 360,
                    "signal": "chest",
                    "window_slicing_method": "time_related",
                    "window_size": 5.0
                },
                "samples": [
                    {
                      "sample_id": "f70c1033-36ae-4b8b-8b89-099a96dccca5",
                      "subject_id": "participant_1",
                      "timestamp_idx": [1679709871, 1679713471, 1679720671, 1679724071, 1679727471, 1679730871, 1679734271, 1679737671, 1679741071, 1679744471, 1679747871, 1679751271, 1679754671],
                      "ecg": [1.0, -1.100878, -3.996840, 0.5, -2.345, 1.234, -0.987, 2.345, -1.234, 0.5, 1.0, 1.0, 1.0],
                      "label": ["undefined", "stress", "undefined", "undefined", "calm", "undefined", "stress", "calm", "undefined", "stress", "undefined", "undefined", "undefined"]
                    },
                    {
                      "sample_id": "sample_id_2",
                      "subject_id": "participant_2",
                      "timestamp_idx": [1679709871, 1679713471, 1679720671, 1679724071, 1679727471, 1679730871, 1679734271, 1679737671, 1679741071, 1679744471, 1679747871, 1679751271, 1679754671],
                      "ecg": [1.2, -1.567, -3.456, 0.7, -2.789, 1.876, -0.345, 2.567, -1.876, 0.7, 1.0, 1.0, 1.0],
                      "label": ["undefined", "stress", "undefined", "calm", "undefined", "stress", "calm", "undefined", "stress", "undefined", "undefined", "undefined", "undefined"]
                    },
                    {
                      "sample_id": "sample_id_3",
                      "subject_id": "participant_3",
                      "timestamp_idx": [1679709871, 1679713471, 1679720671, 1679724071, 1679727471, 1679730871, 1679734271, 1679737671, 1679741071, 1679744471, 1679747871, 1679751271, 1679754671],
                      "ecg": [0.8, -1.234, -3.789, 0.9, -2.567, 1.456, -0.234, 2.789, -1.456, 0.9, 1.0, 1.0, 1.0],
                      "label": ["undefined", "calm", "stress", "undefined", "stress", "undefined", "calm", "undefined", "stress", "undefined", "undefined", "undefined", "undefined"]
                    }
                  ]
            }""")

### Json to Pandas

In [None]:
# to pandas
df = pd.json_normalize(ecg_batch, record_path=['samples'], meta=['supervisor', 'record_date', ['configs', 'device_name'], ['configs', 'frequency'], ['configs', 'signal'], ['configs', 'window_slicing_method'], ['configs', 'window_size']])

In [None]:
df

In [None]:
# explode every list colmn to rows
df_exploded = df.explode(['timestamp_idx', 'ecg', 'label'])

In [None]:
df_exploded

In [None]:
# make function from json to pandas
# possible To-DO: auto fill meta characters by looking trough json file
def json_to_pandas(json_data, meta=None):
    """
    Convert JSON data to a Pandas DataFrame.

    This function takes a JSON data object and performs transformations to convert it into a structured Pandas DataFrame. The input JSON data should have a nested structure containing a list of samples, along with additional metadata such as supervisor, record_date and configs.frequency. More metadata can be added by extending the meta parameter.

    :json_data: JSON data object to convert to Pandas DataFrame.
    :type json_data: dict
    :meta: Additional metadata to include in the DataFrame, defaults to None. The meta parameter should be a list of strings, where each string is a path to a nested key in the JSON data object.
    :type meta: list, optional

    :return: Pandas DataFrame containing the JSON data.
    :rtype: pandas.DataFrame
    """
    df = pd.json_normalize(json_data, record_path=['samples'], meta=['supervisor', 'record_date', ['configs', 'frequency']] + meta)
    df_exploded = df.explode(['timestamp_idx', 'ecg', 'label'])

    return df_exploded

In [None]:
# test function
df_test = json_to_pandas(ecg_batch, meta=[['configs', 'device_name'], ['configs', 'signal'], ['configs', 'window_slicing_method'], ['configs', 'window_size']])

In [None]:
df_test

### Pandas to JSON

In [None]:
# inverse pandas explode
exploded_cols = ['timestamp_idx', 'ecg', 'label']
df_imploded = df_exploded.groupby(list(set(df_exploded.columns) - set(exploded_cols))) \
    .agg({'timestamp_idx': list,
          'ecg': list,
          'label': list}) \
    .reset_index()

In [None]:
df_imploded

In [None]:
# get configs dict
config_cols = df_imploded.columns[df_imploded.columns.str.startswith('configs')]
configs_dict = df_imploded[config_cols].rename(columns=lambda x: x.removeprefix('configs.')).to_dict('records')[0]

In [None]:
configs_dict

In [None]:
# get sample dicts
sample_cols = ['sample_id', 'subject_id', 'timestamp_idx', 'ecg', 'label']
samples = df_imploded[sample_cols].to_dict('records')

In [None]:
samples

In [None]:
# create json dict
json_dict = {
    'supervisor': df_imploded['supervisor'].iloc[0],
    'record_date': df_imploded['record_date'].iloc[0],
    'configs': configs_dict,
    'samples': samples
}

In [None]:
# dict to json and save
with open('../data/example0_input.json', 'w') as file:
    json.dump(json_dict, file, indent=4)

In [None]:
# make function from pandas to json
def pandas_to_json(df_exploded, save_path=None):
    """
    Convert a Pandas DataFrame to JSON data dict.

    This function takes a Pandas DataFrame and performs transformations to convert it into a JSON data dict.

    :param df_exploded: Pandas DataFrame to convert to JSON data dict.
    :type df_exploded: pandas.DataFrame
    :param save_path: Path to save the JSON data dict to, defaults to None. If None, the JSON data dict will not be saved.
    :type save_path: str, optional

    :return: JSON data dict containing the Pandas DataFrame.
    :rtype: dict
    """
    # inverse pandas explode
    exploded_cols = ['timestamp_idx', 'ecg', 'label']
    df_imploded = df_exploded.groupby(list(set(df_exploded.columns) - set(exploded_cols))) \
        .agg({'timestamp_idx': list,
              'ecg': list,
              'label': list}) \
        .reset_index()

    # get configs dict
    config_cols = df_imploded.columns[df_imploded.columns.str.startswith('configs')]
    configs_dict = df_imploded[config_cols].rename(columns=lambda x: x.removeprefix('configs.')).to_dict('records')[0]

    # get sample dicts
    sample_cols = ['sample_id', 'subject_id', 'timestamp_idx', 'ecg', 'label']
    samples = df_imploded[sample_cols].to_dict('records')

    # create json dict
    json_dict = {
        'supervisor': df_imploded['supervisor'].iloc[0],
        'record_date': df_imploded['record_date'].iloc[0],
        'configs': configs_dict,
        'samples': samples
    }

    if save_path:
        # dict to json and save
        with open(save_path, 'w') as file:
            json.dump(json_dict, file, indent=4)

    return json_dict

In [None]:
# test pandas to json fucntion
json_test = pandas_to_json(df_exploded, '../data/example0_input_test.json')

# Pandas DataFrame <-> CSV Input Format

# Build proper example files
Given the input of a h5 file we are building a json file as well as a csv file. These files contain a formatted version of the self acquired ecg data of the h5 file. The h5 file is the direct output of the ecg acquisition device with it`s corresponding software, opensignals. As input both take a pandas dataframe build from the h5 file as can be seen in the first chapter of this notebook (Pandas DataFrame <-> H5 Input Format). The json file is build with the function pandas_to_json() and the csv file is build with the function pandas_to_csv(). Both functions are defined in the previous code cells.

In [11]:
len(df_selfacq)

2206350

In [14]:
# sample data, because it is too big for just a test
df = df_selfacq.iloc[1000000:1060000]
print(f'Length of {len(df)} with a sampling rate of 1000 represents a time of {len(df)/1000} seconds !')

Length of 60000 with a sampling rate of 1000 represents a time of 60.0 seconds !


In [None]:
df

In [None]:
# make json
json_test = pandas_to_json(df, '../data/example1_input.json')

In [None]:
# make csv
csv_df = df.drop(list(config_cols) + ['supervisor', 'record_date'], axis=1)
csv_test = csv_df.to_csv('../data/example1_input_without_header.csv', index=False)

# Window Slicing and Processing

In [15]:
def create_windows(df, time_column, window_size=5.0, window_slicing_method='time_related'):
    """
    Slices a dataframe into windows of a given size. The windows can be sliced in different ways. The windows are returned as a generator of dataframes. The dataframe must have a column containing timestamps and be indexed by it.

    :param df: The dataframe to slice.
    :type df: pandas.DataFrame
    :param time_column: The name of the column containing the timestamps.
    :type time_column: str
    :param window_size: The size of the windows in seconds.
    :type window_size: int
    :param window_slicing_method: The method used to slice the windows.
    :type window_slicing_method: str

    :return: A generator of dataframes containing the windows.
    :rtype: generator
    """
    # Convert the timestamp column to datetime if it's not already
    if not pd.api.types.is_datetime64_ns_dtype(df[time_column]):
        df[time_column] = pd.to_datetime(df[time_column])

    # Slice the dataframe into windows
    if window_slicing_method == 'time_related':
        # Resample the dataframe every x seconds
        result_dfs = [group for _, group in df.groupby(pd.Grouper(key=time_column, freq=f'{window_size}S'))]
        return result_dfs
    elif window_slicing_method == 'label_related_before':
        pass
    elif window_slicing_method == 'label_related_after':
        pass
    elif window_slicing_method == 'label_related_centered':
        pass
    else:
        raise ValueError(f'window_slicing_method {window_slicing_method} not supported')

In [16]:
df

Unnamed: 0,timestamp_idx,ecg,label,sample_id,subject_id,supervisor,record_date,configs.device_name,configs.frequency,configs.signal,configs.window_slicing_method,configs.window_size
1000000,2023-05-23 23:19:41.335,1500.0,undefined,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,Lieschen Mueller,2034-01-16,bioplux,1000,chest,time_related,5.0
1000001,2023-05-23 23:19:41.336,4405.0,undefined,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,Lieschen Mueller,2034-01-16,bioplux,1000,chest,time_related,5.0
1000002,2023-05-23 23:19:41.337,3072.0,undefined,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,Lieschen Mueller,2034-01-16,bioplux,1000,chest,time_related,5.0
1000003,2023-05-23 23:19:41.338,1405.0,undefined,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,Lieschen Mueller,2034-01-16,bioplux,1000,chest,time_related,5.0
1000004,2023-05-23 23:19:41.339,838.0,undefined,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,Lieschen Mueller,2034-01-16,bioplux,1000,chest,time_related,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1059995,2023-05-23 23:20:41.330,2771.0,undefined,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,Lieschen Mueller,2034-01-16,bioplux,1000,chest,time_related,5.0
1059996,2023-05-23 23:20:41.331,2040.0,undefined,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,Lieschen Mueller,2034-01-16,bioplux,1000,chest,time_related,5.0
1059997,2023-05-23 23:20:41.332,1960.0,undefined,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,Lieschen Mueller,2034-01-16,bioplux,1000,chest,time_related,5.0
1059998,2023-05-23 23:20:41.333,1861.0,undefined,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,Lieschen Mueller,2034-01-16,bioplux,1000,chest,time_related,5.0


In [17]:
%%time
# preprocess ecg
df['ecg'] = nk.ecg_clean(df['ecg'], sampling_rate=1000, method="pantompkins1985")
# slice in windows (window_size and window_slicing_method)
windows = create_windows(df, time_column='timestamp_idx', window_size=5.0)
print(f'Number of windows: {len(list(windows))}')
# compute ecg features vor each window
features_df = pd.DataFrame()
for i, window in enumerate(windows):
    # compute features
    features = fe.hrv_features(window['ecg'].values, sampling_rate)
    tmp = pd.DataFrame(features, index=[0])
    tmp['sample_id'] = df['sample_id'].unique()
    tmp['subject_id'] = df['subject_id'].unique()
    tmp['window_id'] = i
    tmp['w_start_time'] = window['timestamp_idx'].min()
    tmp['W_end_time'] = window['timestamp_idx'].max()
    # combine features
    features_df = pd.concat([features_df, tmp], axis=0)
    features_df.reset_index(drop=True, inplace=True)

Number of windows: 13
CPU times: total: 93.8 ms
Wall time: 246 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[time_column] = pd.to_datetime(df[time_column])
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio = lf_band / hf_band
  lf_hf_ratio 

In [18]:
features_df

Unnamed: 0,mean_rr,sdnn,rmssd,nn50,pnn50,lf_band,hf_band,lf_hf_ratio,sample_id,subject_id,window_id,w_start_time,W_end_time
0,0.6426,0.186896,0.336774,3,0.6,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,0,2023-05-23 23:19:41.335,2023-05-23 23:19:44.999
1,0.34825,0.153873,0.270766,7,0.583333,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,1,2023-05-23 23:19:45.000,2023-05-23 23:19:49.999
2,0.392545,0.117491,0.169679,8,0.727273,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,2,2023-05-23 23:19:50.000,2023-05-23 23:19:54.999
3,0.4494,0.163658,0.184729,8,0.8,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,3,2023-05-23 23:19:55.000,2023-05-23 23:19:59.999
4,0.36775,0.165213,0.271478,11,0.916667,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,4,2023-05-23 23:20:00.000,2023-05-23 23:20:04.999
5,0.401909,0.134468,0.219575,8,0.727273,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,5,2023-05-23 23:20:05.000,2023-05-23 23:20:09.999
6,0.294875,0.131194,0.200083,12,0.75,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,6,2023-05-23 23:20:10.000,2023-05-23 23:20:14.999
7,0.398182,0.183843,0.247847,9,0.818182,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,7,2023-05-23 23:20:15.000,2023-05-23 23:20:19.999
8,0.372462,0.149817,0.188498,8,0.615385,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,8,2023-05-23 23:20:20.000,2023-05-23 23:20:24.999
9,0.4393,0.13927,0.188644,9,0.9,0.0,0.0,,f70c1033-36ae-4b8b-8b89-099a96dccca5,participant_1,9,2023-05-23 23:20:25.000,2023-05-23 23:20:29.999


In [None]:
features_df.to_json('../data/example1_output.json', orient='records', lines=True, indent=4)