# Exploring .icmh5 File Structure

### Useful Imports and Preliminaries

In [None]:
import os
import sys
sys.path.append("..")  # add project root

import h5py

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from tqdm import tqdm

from src.data_utils import *

# pending useful sklearn imports

In [None]:
# display floats with thousands separators and two decimals, padded to width 10
pd.options.display.float_format = '{:10,.2f}'.format

In [None]:
# seed NumPy's global RNG; the random file / patient picks below use np.random.randint
np.random.seed(420)

In [None]:
# apply seaborn's default theme with the "talk" plotting context to all figures
sns.set_theme(context="talk")

In [None]:
# path constants
# NOTE(review): absolute, user-specific locations -- adjust when running elsewhere
data_dir = "/home/mr2238/project_pi_np442/mr2238/accelerate/data"
img_dir = "/home/mr2238/project_pi_np442/mr2238/accelerate/imgs/overview"  # figures are saved here
labels_path = os.path.join(data_dir, "labels")  # directory listed for label .csv files
raw_data_path = os.path.join(data_dir, "raw_data")  # directory listed for .icmh5 recordings

In [None]:
# list files
# sorted because os.listdir returns entries in arbitrary order, which would make
# the seeded random pick of an example file differ between machines / runs
h5py_files = sorted(f for f in os.listdir(raw_data_path) if f.endswith(".icmh5"))
print(f"Number of h5py files: {len(h5py_files)}")
if h5py_files:
    print(f"Example file: {h5py_files[0]}")
else:
    # avoid an IndexError on an empty directory and say what went wrong instead
    print(f"No .icmh5 files found in {raw_data_path}")

### Summarize a random file

Here we explore the structure and data series that compose one recording.

In [None]:
# load single random example
# randint's upper bound is exclusive, so idx is always a valid list position
idx = np.random.randint(0, len(h5py_files))
example_file = h5py_files[idx]

In [None]:
# show the name of the recording that was drawn
print(example_file)

In [None]:
# read the invalid-value sentinel and the names of the numeric / waveform series
example_path = os.path.join(raw_data_path, example_file)
with h5py.File(example_path, "r") as f:
    # the sentinel is stored as the first element of the file-level attribute
    invalid_value = float(f.attrs["invalidValue"][0])
    print(f"Invalid value: {invalid_value}")
    numerics = [*f["numerics"].keys()]
    waves = [*f["waves"].keys()]
    for heading, names in (("Numerics", numerics), ("Waves", waves)):
        print(f"{heading}: {names}")


In [None]:
# summarize random example file
# h5py_summarize is not defined in this notebook; presumably it is provided by
# the star import from src.data_utils -- NOTE(review): confirm
print(f"Summarizing example file {example_file}:")
h5py_summarize(os.path.join(raw_data_path, example_file))


In [None]:
# print the "definitions/qualityRef" dataset of the example file as a table
with h5py.File(os.path.join(raw_data_path, example_file), "r") as f:
    print(pd.DataFrame(f["definitions/qualityRef"][:]))

Next, we summarize each data series to inspect its units and value distribution:

In [None]:
# summarize numerics and waveforms
def summarize_series(name, obj, invalid_value=invalid_value):
    """Print the missing-value count and descriptive statistics of one dataset.

    Intended as the callback for ``h5py.Group.visititems``, which calls it
    with the name of each visited member and the member object itself.

    Args:
        name: Name of the visited member; printed as a header.
        obj: The visited member. Only ``h5py.Dataset`` objects are summarized.
        invalid_value: Sentinel replaced by NaN before statistics are computed.
            The default is bound when the function is defined, i.e. to the
            value the global ``invalid_value`` holds at that moment.

    Returns:
        None, so that ``visititems`` keeps walking the group.
    """
    if not isinstance(obj, h5py.Dataset):
        # visititems also visits sub-groups, which cannot be read with obj[:]
        return None
    print(f"Dataset: {name}")
    df = pd.DataFrame(obj[:]).replace(invalid_value, np.nan)
    print(f"Number of missing values: {df.isna().sum().sum()}")
    print(df.describe())
    print("\n")
    return None

# walk both top-level groups of the example file, numerics first, then waves
print(f"Summarizing statistics for numerics and waveforms in file {example_file}:")
with h5py.File(os.path.join(raw_data_path, example_file), "r") as f:
    for group_name in ("numerics", "waves"):
        f[group_name].visititems(summarize_series)

### Plot a random file

Here we plot the various data series of a random file to observe their dynamics. 

#### Timeseries data

First we can just naively plot a data series without considering data gaps.

In [None]:
# read the heart-rate series and blank out the invalid-value sentinel
with h5py.File(os.path.join(raw_data_path, example_file), "r") as f:
    df = pd.DataFrame(f["numerics/hr"]).replace(invalid_value, np.nan)

# x axis is the row position divided by 3600, i.e. hours only if rows are
# one second apart -- NOTE(review): confirm the sampling rate
hours = df.index / (60 * 60)

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(hours, df[0], label="Heart Rate", s=0.5)
ax.set(
    xlabel="Time (hr)",
    ylabel="Heart Rate (bpm)",
    title="Heart Rate over Time",
    ylim=(25, 200),
)
img_name = f"hr_series_{example_file.removesuffix('.icmh5')}_nogaps.png"
fig.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')
plt.show()
plt.close()

However, the recording contains gaps; these are recorded in the ``index`` attribute of each dataset (e.g. ``numerics/hr``).

In [None]:
# Plot continuous time series with gaps as NaNs
# build_continuous_time is not defined in this notebook (presumably it comes from
# the star import of src.data_utils); its result is used below as a DataFrame
# whose index is in seconds -- NOTE(review): confirm against the helper
with h5py.File(os.path.join(raw_data_path, example_file), "r") as f:
    df = build_continuous_time(f, 'numerics/hr')


fig, ax = plt.subplots(figsize=(12,6))
ax.scatter(df.index/(60*60), df[0], label = "Heart Rate", s=0.5)
ax.set_xlabel("Time (hr)")
ax.set_ylabel("Heart Rate (bpm)")
ax.set_title("Heart Rate over Time with recording gaps")
ax.set_ylim(25, 200)

img_name = f"hr_series_{example_file.removesuffix('.icmh5')}_full.png"
fig.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')
# plt.show() rather than fig.show(): the latter needs a GUI event loop and only
# warns on non-interactive backends; then close the figure like the other
# plotting cells do so open figures do not accumulate
plt.show()
plt.close()

#### Distribution of recording variables

Next we can investigate the general distribution of all the variables of a random recording.

In [None]:
# can do histograms for all numerics
with h5py.File(os.path.join(raw_data_path, example_file), "r") as f:
    grp = f["numerics"]
    # two rows; round the column count up so an odd number of series still fits
    ncols = max(1, (len(numerics) + 1) // 2)
    fig, axs = plt.subplots(
        nrows=2,
        ncols=ncols,
        layout='constrained',
        figsize=(5 * (len(numerics) // 2 + 1), 8),
        squeeze=False,  # keep axs 2-D even when there is a single column
    )
    # column-major view of the grid: panel i sits at row i % 2, column i // 2
    axes_in_fill_order = axs.flatten(order="F")
    for i, n in enumerate(numerics):
        df = pd.DataFrame(grp[n])
        df.replace(invalid_value, np.nan, inplace=True)
        ax = axes_in_fill_order[i]
        # "t" is drawn as a histogram, every other series as a smoothed KDE
        if n == "t":
            ax.hist(df[0], bins="auto", density=True, label="Distribution")
        else:
            sns.kdeplot(df[0], ax=ax, fill=True, bw_adjust=3, label="Distribution")
        if n == "spo2":
            ax.set_xlim(70, 110)
        ax.set_ylabel("Density")
        ax.set_xlabel("Value")
        ax.set_title(n + f" Distribution (Invalid Values = {df[0].isna().sum()/ len(df)*100:.3f}%)")
        mean_value = df[0].mean()
        ax.axvline(x=mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
        ax.legend()
    # blank the left-over panel(s) so no empty axes frame is drawn
    for unused_ax in axes_in_fill_order[len(numerics):]:
        unused_ax.set_axis_off()

fig.suptitle(f"Histogram of numeric values for file {example_file}")
img_name = f"numerics_hist_{example_file.removesuffix('.icmh5')}.png"
plt.savefig(os.path.join(img_dir, img_name))
plt.show()
plt.close()

In [None]:
print(waves)
# it does not make sense to plot distribution of ECG voltages
# filter instead of list.remove so the cell neither raises ValueError when
# "ecg.ii" is absent nor fails when it is executed a second time
waves = [w for w in waves if w != "ecg.ii"]

In [None]:
# can do histograms for all waves
with h5py.File(os.path.join(raw_data_path, example_file), "r") as f:
    grp = f["waves"]
    nrows = 4
    # round up: one extra column whenever the last column is only partly filled
    ncols = max(1, (len(waves) + nrows - 1) // nrows)
    fig, axs = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        layout='constrained',
        figsize=(8*ncols, 4*nrows),
        squeeze=False,  # keep axs 2-D even when there is a single column
    )
    # column-major view of the grid: panel i sits at row i % nrows, column i // nrows
    axes_in_fill_order = axs.flatten(order="F")
    # wrap the list itself (not enumerate) so tqdm knows the total
    for i, n in enumerate(tqdm(waves)):
        df = pd.DataFrame(grp[n])
        df.replace(invalid_value, np.nan, inplace=True)
        ax = axes_in_fill_order[i]
        # these three are drawn as histograms, every other wave as a smoothed KDE
        if n == "cvp" or n == "pleth" or n == "icp":
            ax.hist(df[0], bins="auto", density=True, label="Distribution")
        else:
            sns.kdeplot(df[0], ax=ax, fill=True, bw_adjust=3, label="Distribution")
        if n == "spo2":
            ax.set_xlim(70, 110)
        ax.set_ylabel("Density")
        ax.set_xlabel("Value")
        ax.set_title(n + f" Distribution (Invalid Values = {df[0].isna().sum()/ len(df)*100:.3f}%)")
        mean_value = df[0].mean()
        ax.axvline(x=mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
        ax.legend()

# hide the panels that received no data; this must follow the same column-major
# order used for filling, otherwise panels that do hold a plot get blanked
for a in axes_in_fill_order[len(waves):]:
    a.set_axis_off()

fig.suptitle(f"Histogram of waves values for file {example_file}")
img_name = f"waves_hist_{example_file.removesuffix('.icmh5')}.png"
plt.savefig(os.path.join(img_dir, img_name))
plt.show()
plt.close()

### Database-Wide Statistics

In this section, we plot some database-wide statistics to obtain summaries of the whole dataset.

#### Variables recorded per patient by count

Not every patient has the same variables recorded. To select the best ones, we need statistics on what proportion of patients have each variable recorded.

In [None]:
from collections import Counter

In [None]:
# find variables of patients
# counts, for every series name, the number of files that contain it
# (a name present under both "numerics" and "waves" is counted twice per file)
recorded_vars = Counter()
# wrap the list itself (not enumerate) so tqdm knows the total
for file in tqdm(h5py_files):
    with h5py.File(os.path.join(raw_data_path, file), "r") as f:
        file_invalid_value = float(f.attrs["invalidValue"][0])
        if invalid_value != file_invalid_value:
            print("Invalid value is different, problem")
            print(f"Invalid value: {file_invalid_value}")

        # feed the key views straight into the counter: the globals `numerics`
        # and `waves` describe the example file and must not be overwritten here
        recorded_vars.update(f["numerics"].keys())
        recorded_vars.update(f["waves"].keys())

vars_per_dataset = pd.Series(dict(recorded_vars))


In [None]:
# order variables from the highest to the lowest file count (in place)
vars_per_dataset.sort_values(inplace=True, ascending=False)

In [None]:
# horizontal bar chart: share of files in which each variable is present
# NOTE(review): the denominator is the number of files, while the labels say
# "patients" -- confirm that there is one file per patient
with plt.rc_context({'ytick.left': True, 'xtick.bottom': True}):
    fig, ax = plt.subplots(figsize=(8, 12))

    coverage_pct = vars_per_dataset / len(h5py_files) * 100
    sns.barplot(data=coverage_pct, orient="h", ax=ax)

    ax.set_title('Variables Recorded per Patient')
    ax.set_xlabel('Percentage of Patients with Recorded Data')
    ax.set_ylabel('Variable')
    ax.tick_params(axis='y', labelsize=14)

    # minor ticks every 5 percentage points across the full 0-100 range
    ax.set_xticks(np.arange(0, 101, 5), minor=True)
    ax.set_xlim(0, 100)

    img_name = "variable_coverage.png"
    fig.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')

    plt.show()

In [None]:
# display the variable names in their current order
vars_per_dataset.index

#### Durations with and without gaps

In [None]:
# loop through all the files and extract duration in seconds with gaps
# durations are summed per key, so several files sharing a key are pooled
durations_gaps = {}

# wrap the list itself (not enumerate) so tqdm knows the total
for file in tqdm(h5py_files):
    # key: the file name up to the first "_", cut again at the first "."
    key = file.split("_")[0].split(".")[0]

    with h5py.File(os.path.join(raw_data_path, file), "r") as f:
        # duration with gaps: the attribute's first element is a string that
        # has " seconds" stripped before conversion to int
        seconds = int(f.attrs["duration"][0].replace(" seconds", ""))
        durations_gaps[key] = seconds + durations_gaps.get(key, 0)

In [None]:
# loop through all the files and extract duration in seconds without gaps
# durations are summed per key, so several files sharing a key are pooled
durations_no_gaps = {}

# wrap the list itself (not enumerate) so tqdm knows the total
for file in tqdm(h5py_files):
    # key: the file name up to the first "_", cut again at the first "."
    key = file.split("_")[0].split(".")[0]

    with h5py.File(os.path.join(raw_data_path, file), "r") as f:
        # duration without gaps
        try:
            index = pd.DataFrame(f["numerics/hr"].attrs["index"])
        except KeyError:
            # only a missing dataset / attribute is expected here; any other
            # error (or Ctrl-C) should surface instead of being mislabelled
            print(f"numerics/hr not found in {file}")
            continue
        # each index row describes one segment: samples / sampling frequency
        time_per_segment = index["length"].astype('float64') / index["frequency"]
        seconds = time_per_segment.sum().item()
        durations_no_gaps[key] = seconds + durations_no_gaps.get(key, 0)

In [None]:
# plot violin plots of durations with and without gaps

# fixed left-to-right order, shared by the violins and their median labels so
# the two can never drift apart
group_order = ['Excluding Gaps', 'Including Gaps']

# durations are stored in seconds; convert to hours
df1 = pd.DataFrame({'Value': np.array(list(durations_no_gaps.values()))/(60*60), 'Group': group_order[0]})
df2 = pd.DataFrame({'Value': np.array(list(durations_gaps.values()))/(60*60), 'Group': group_order[1]})

# Concatenate the two DataFrames; ignore_index avoids duplicate row labels
combined_df = pd.concat([df1, df2], ignore_index=True)
medians = combined_df.groupby('Group')['Value'].median().reindex(group_order)

with plt.rc_context({'ytick.left': True}):
    with sns.axes_style("darkgrid"):
        fig, ax = plt.subplots(figsize=(12,6))
        sns.violinplot(
            x='Group',
            y='Value',
            data=combined_df,
            ax=ax,
            hue="Group",
            order=group_order,
            hue_order=group_order,
            palette="pastel",
        )
        ax.set_title('Side-by-Side Violinplots of Dataset Durations')
        ax.set_ylabel('Duration (hours)')
        ax.set_xlabel('')
        ax.set_yticks(np.arange(0, 251, 25), minor=True)
        ax.set_ylim(0, 250)

        # write each median slightly right of and below its position in the violin
        for i, v in enumerate(medians):
            ax.text((i+0.025), (v-2), str(round(v, 2)), fontsize = 12)

        img_name = "duration_distributions.png"
        plt.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')
        plt.show()

#### Global distributions of all raw variables

In [None]:
# need to calculate bin edges on all data of a specific var
histograms = {"waves" : {}, "numerics" : {}}

# NOTE(review): `file` is whatever the last loop over h5py_files left behind,
# so only that one recording is opened here -- confirm this is intended
with h5py.File(os.path.join(raw_data_path, file), "r") as f:
    for var_type in histograms:
        db_vars = f[var_type]
        for n in db_vars:
            # TODO: the loop body was never written; `pass` keeps the cell
            # syntactically valid until the bin-edge computation is added
            pass

In [None]:
# calculate histograms for all ptids and combine them
histograms = {"waves" : {}, "numerics" : {}}

# NOTE(review): `file` is whatever the last loop over h5py_files left behind,
# so only that one recording is opened here -- confirm this is intended
with h5py.File(os.path.join(raw_data_path, file), "r") as f:
    for var_type in histograms:
        db_vars = f[var_type]
        for n in db_vars:
            # TODO: the loop body was never written; `pass` keeps the cell
            # syntactically valid until the per-variable histogram is added
            pass
            


# NOTE(review): this repeats the "duration with gaps" computation from the
# "Durations with and without gaps" section and resets durations_gaps; it
# looks like pasted reference code -- consider deleting it
# loop through all the files and extract duration in seconds with gaps
durations_gaps = {}

for i, file in tqdm(enumerate(h5py_files)):
    # key: the file name up to the first "_", cut again at the first "."
    key = [s.split(".") for s in file.split("_")][0][0]

    with h5py.File(os.path.join(raw_data_path, file), "r") as f:
        # duration with gaps
        time = int(f.attrs["duration"][0].replace(" seconds", ""))
        durations_gaps[key] = time + durations_gaps.get(key, 0)


# NOTE(review): this repeats the per-file numerics histogram from the
# "Distribution of recording variables" section; it looks like pasted
# reference code -- consider deleting it
with h5py.File(os.path.join(raw_data_path, example_file), "r") as f:
    grp = f["numerics"]
    # re-read the names from this file: the global `numerics` is reassigned by
    # the per-file loop of the database-wide section and may list series that
    # the example file does not contain
    numerics = list(grp.keys())
    # two rows; round the column count up so an odd number of series still fits
    ncols = max(1, (len(numerics) + 1) // 2)
    fig, axs = plt.subplots(
        nrows=2,
        ncols=ncols,
        layout='constrained',
        figsize=(5 * (len(numerics) // 2 + 1), 8),
        squeeze=False,  # keep axs 2-D even when there is a single column
    )
    # column-major view of the grid: panel i sits at row i % 2, column i // 2
    axes_in_fill_order = axs.flatten(order="F")
    for i, n in enumerate(numerics):
        df = pd.DataFrame(grp[n])
        df.replace(invalid_value, np.nan, inplace=True)
        ax = axes_in_fill_order[i]
        # "t" is drawn as a histogram, every other series as a smoothed KDE
        if n == "t":
            ax.hist(df[0], bins="auto", density=True, label="Distribution")
        else:
            sns.kdeplot(df[0], ax=ax, fill=True, bw_adjust=3, label="Distribution")
        if n == "spo2":
            ax.set_xlim(70, 110)
        ax.set_ylabel("Density")
        ax.set_xlabel("Value")
        ax.set_title(n + f" Distribution (Invalid Values = {df[0].isna().sum()/ len(df)*100:.3f}%)")
        mean_value = df[0].mean()
        ax.axvline(x=mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
        ax.legend()
    # blank the left-over panel(s) so no empty axes frame is drawn
    for unused_ax in axes_in_fill_order[len(numerics):]:
        unused_ax.set_axis_off()

fig.suptitle(f"Histogram of numeric values for file {example_file}")
img_name = f"numerics_hist_{example_file.removesuffix('.icmh5')}.png"
plt.savefig(os.path.join(img_dir, img_name))
plt.show()
plt.close()


In [None]:
# graph each variable

### Labels

Now, let us explore the label dataset.

In [None]:
# find all patient ids for whom I have labels
# an id is the part of a label file name before the first "_"; a set removes
# duplicates, and sorting keeps the seeded random draw below independent of
# the arbitrary order returned by os.listdir
# (the ids could alternatively be derived from the .icmh5 file names)
ptid_list = sorted({fpath.split("_")[0] for fpath in os.listdir(labels_path) if ".csv" in fpath})

In [None]:
# random label file
# load_label is not defined in this notebook; presumably it is provided by the
# star import from src.data_utils -- NOTE(review): confirm
r_id = ptid_list[np.random.randint(0, len(ptid_list))]
df = load_label(r_id, labels_path=labels_path, time="seconds")
df.head()

In [None]:
# load df, find seconds elapsed to calculate opt MAP
# find_time_elapsed is not defined in this notebook (presumably from
# src.data_utils); patients for which it yields a missing value are dropped
calc = ["MAPopt_Yale_affected_beta", "LLA_Yale_affected_beta", "ULA_Yale_affected_beta"]
elapsed_times = pd.Series([find_time_elapsed(pt, calc, labels_path) for pt in tqdm(ptid_list)]).dropna()
print(elapsed_times.describe())


In [None]:
# patients left after dropping missing elapsed times vs. all patients with a label file
with_data = elapsed_times.shape[0]
all_pt = len(ptid_list)

In [None]:
# number of non-missing elapsed-time entries
print(elapsed_times.shape)

In [None]:
# density histogram (with KDE) of the elapsed times, marked with mean and median
with plt.rc_context({'xtick.bottom': True}):
    # elapsed times are in seconds; convert once to hours and reuse below
    elapsed_hours = pd.Series(elapsed_times) / (60 * 60)
    mean_value = elapsed_hours.mean()
    med_value = elapsed_hours.median()

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(data=elapsed_hours, ax=ax, kde=True, stat="density", kde_kws={'bw_adjust': 0.4})
    ax.set_title(f"Time required to calculate limits of autoregulation (calculated for {with_data}/{all_pt} patients)")
    ax.set_xlabel('Time (hours)')
    ax.set_ylabel("Density")
    # minor ticks every half hour over the displayed range
    ax.set_xticks(np.arange(0, 20, 0.5), minor=True)
    ax.set_xlim(0, 20)

    ax.axvline(x=mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f} hours')
    ax.axvline(x=med_value, color='green', linestyle='--', label=f'Median: {med_value:.2f} hours')
    # one legend call after both reference lines exist lists both of them
    ax.legend()

    img_name = "mapopt_calc_distributions.png"
    fig.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')
    plt.show()

### Autoregulation status

Here we explore how much of the time a given patient stays within their autoregulatory limits, to get a sense of how this is distributed across patients.