In [None]:
import h5py
import pandas as pd
import numpy as np
import os
import datetime
from tqdm import tqdm

In [None]:
! pip install nb_black

In [None]:
# Load the lab_black IPython extension.
# NOTE(review): presumably provided by the nb_black package installed in the
# previous cell, and used to auto-format code cells — confirm.
%load_ext lab_black

In [None]:
# Machine id -> base directory in which that machine keeps its ECG HD5 files.
# get_ecg() looks a file up here when it is given a bare name instead of a path.
where_dict = {"quanta": "/storage/shared/ecg/mgh/"}

# Names of the twelve leads, in the column order of the arrays built by get_ecg().
# Each name is also the key under which that lead is stored inside an ECG entry.
LEADS = "I II III aVR aVL aVF V1 V2 V3 V4 V5 V6".split()


def get_ecg(name, path=True, date=False, machine="quanta"):
    """Load the voltages of one ECG stored in an HD5 file.

    Parameters
    ----------
    name : str
        Full path of the HD5 file when ``path`` is true. Otherwise the file's
        base name (without the ``.hd5`` extension) inside the directory that
        ``where_dict`` registers for ``machine``.
    path : bool, default True
        Whether ``name`` is already a complete file path.
    date : str, datetime.date, datetime.datetime or False, default False
        Day of the ECG to load, either as a ``"YYYY-MM-DD"`` string or as any
        object with a ``strftime`` method. When several ECGs share that day,
        the first one in the file's key order is used. When falsy, the most
        recent ECG in the file is loaded.
    machine : str, default "quanta"
        Key into ``where_dict``; only consulted when ``path`` is false.

    Returns
    -------
    numpy.ndarray or float
        Float array of shape ``(n_samples, len(LEADS))`` whose columns follow
        the order of ``LEADS``. ``np.nan`` is returned (after printing a
        message) when the file has no ``"ecg"`` group, or when no ``date`` was
        requested and the group holds no ECGs.

    Raises
    ------
    Exception
        If ``path`` is false and ``machine`` is not a key of ``where_dict``.
    KeyError
        If ``date`` was given and the file has no ECG on that day.
    """
    if path:
        full_loc = name
    else:
        # go to hd5 on quanta
        if machine not in where_dict:
            raise Exception(
                "Please enter a valid machine id, which include: "
                + str(list(where_dict))
            )
        # os.path.join works whether or not the registered directory ends
        # with a path separator.
        full_loc = os.path.join(where_dict[machine], str(name) + ".hd5")

    # Accept date/datetime-like objects as well as "YYYY-MM-DD" strings; the
    # comparison below is done on strings.
    if date and hasattr(date, "strftime"):
        date = date.strftime("%Y-%m-%d")

    with h5py.File(full_loc, "r") as f:
        if "ecg" not in f:
            print(f"{full_loc} lacks ECG key in HD5!")
            return np.nan

        ecg_group = f["ecg"]
        # Entries of the "ecg" group are keyed by acquisition timestamp.
        date_list = list(ecg_group.keys())
        dtime_list = [
            datetime.datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in date_list
        ]

        if date:
            matches = [
                i for i, t in enumerate(dtime_list) if t.strftime("%Y-%m-%d") == date
            ]
            if not matches:
                raise KeyError("ECG on the given date not found.")
            which_date = matches[0]  # TODO pick best ECG, or look at them all
        elif not dtime_list:
            # Nothing to pick the "most recent" from: treat the file like one
            # without an ECG instead of failing inside argmax.
            print(f"{full_loc} has an empty ECG group in HD5!")
            return np.nan
        else:
            # Index of the most recent ECG (first one on ties).
            which_date = max(range(len(dtime_list)), key=dtime_list.__getitem__)

        record = ecg_group[date_list[which_date]]
        # The first lead fixes the number of samples; every lead is written
        # into its own column of a float array.
        first_lead = record[LEADS[0]][()]
        voltage = np.zeros((first_lead.shape[0], len(LEADS)))
        voltage[:, 0] = first_lead
        for i, lead in enumerate(LEADS[1:], start=1):
            voltage[:, i] = record[lead][()]
    return voltage

In [None]:
# Get list of paths to all MGH ECG HD5 files
def list_hd5_paths(root_dir):
    """Return the sorted paths of all ``.hd5`` files found under ``root_dir``.

    The directory tree is walked recursively, showing one progress bar per
    visited directory. A missing ``root_dir`` yields an empty list.

    Parameters
    ----------
    root_dir : str
        Directory to search.

    Returns
    -------
    list of str
        Paths (``root_dir`` joined with the relative location) in sorted order.
    """
    hd5_paths = []
    for root, _dirs, files in os.walk(root_dir):
        for file in tqdm(files):
            if file.endswith(".hd5"):
                hd5_paths.append(os.path.join(root, file))
    # os.walk reports entries in an arbitrary, filesystem-dependent order.
    # Sorting makes any slice of the result select the same files on every run.
    hd5_paths.sort()
    return hd5_paths


fpaths = list_hd5_paths("/storage/shared/ecg/mgh/")

In [None]:
# Largest voltage sample of each ECG, over at most the first 100,000 file paths.
# Files for which get_ecg returns NaN contribute a NaN entry.
# NOTE(review): np.max looks at signed values only; if "amplitude" should also
# cover negative excursions, np.max(np.abs(voltage)) would be needed — confirm.
per_file_peaks = []
for fpath in tqdm(fpaths[:100000]):
    voltage = get_ecg(fpath, path=True, date=False, machine="quanta")
    per_file_peaks += [np.max(voltage)]
voltage_maximums = np.array(per_file_peaks)

In [None]:
import matplotlib.pyplot as plt

# Upper end of the plotted voltage range; also the threshold used by the
# summary that follows.
max_voltage = 3000

# Entries are NaN for files that held no ECG; leave them out of the plot.
plotted_maximums = voltage_maximums[~np.isnan(voltage_maximums)]

fig, ax = plt.subplots()
# Restrict the binning to the displayed window: without `range`, the 500 bins
# span the full data range, so a few extreme maxima leave only a handful of
# wide bins inside 0..max_voltage.
ax.hist(plotted_maximums, bins=500, range=(0, max_voltage))
ax.set_xlim(0, max_voltage)
ax.set_xlabel("Maximum voltage per ECG")
ax.set_ylabel("Number of ECGs")
ax.set_title(f"Distribution of per-ECG maximum voltage (0 to {max_voltage})");

In [None]:
# Entries are NaN for files in which get_ecg found no ECG. Those are not ECGs,
# so they are excluded before counting and from the denominator.
valid_maximums = voltage_maximums[~np.isnan(voltage_maximums)]
n_ecgs = len(valid_maximums)

ecgs_above_max = int(np.count_nonzero(valid_maximums > max_voltage))
# With no valid ECGs, report NaN instead of raising ZeroDivisionError.
ecgs_above_max_fraction = ecgs_above_max / n_ecgs if n_ecgs else float("nan")
print(
    f"{ecgs_above_max} ({ecgs_above_max_fraction * 100}% of {n_ecgs}) ECGs have a max voltage amplitude >{max_voltage}"
)