## Limits of Autoregulation - Summary Statistics

In this notebook we look at the characteristics of the physiologic data when patients move from inside to outside the limits of autoregulation.

#### Useful Imports and Preliminaries

In [None]:
import os
import sys
sys.path.append("..")  # add project root

import h5py

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from tqdm import tqdm

from src.data_utils import *
from src.constants import *

# pending useful sklearn imports
from scipy.stats import pearsonr

In [None]:
# Render floats in pandas output with thousands separators and two decimals,
# right-aligned in a field of width 10
pd.options.display.float_format = '{:10,.2f}'.format

In [None]:
# Seed NumPy's global RNG so any np.random draws are repeatable across runs
np.random.seed(420)

In [None]:
# Apply seaborn's default theme using the "talk" plotting context for all figures below
sns.set_theme(context="talk")

In [None]:
# path constants -- everything is derived from a single project root so the
# data, image and HDF5 locations cannot drift apart
project_dir = "/home/mr2238/project_pi_np442/mr2238/accelerate"
data_dir = os.path.join(project_dir, "data")
img_dir = os.path.join(project_dir, "imgs", "in_out")
global_path = os.path.join(data_dir, "processed", "all_data.hdf5")
labels_path = os.path.join(data_dir, "labels")
raw_data_path = os.path.join(data_dir, "raw_data")

In [None]:
# Create the figure output directory (and any missing parents);
# exist_ok=True makes this a no-op when the directory is already there
os.makedirs(img_dir, exist_ok=True)

In [None]:
# List the raw recording files. os.listdir returns entries in arbitrary order,
# so sort them to keep the listing (and the example below) deterministic.
h5py_files = sorted(f for f in os.listdir(raw_data_path) if f.endswith(".icmh5"))
print(f"Number of h5py files: {len(h5py_files)}")
# Guard the example: indexing [0] on an empty listing would raise IndexError
if h5py_files:
    print(f"Example file: {h5py_files[0]}")
else:
    print(f"No .icmh5 files found in {raw_data_path}")

### In and Out Statistics

#### Define Limits

Here we explore how much of the time a given patient is within their autoregulatory limits, to get a sense of how this is distributed across patients.
1) Definition: Patient is outside limits of autoregulation if at a given time *t*, the mean ABP value over the minute prior to time *t* is outside the limits calculated at *t*.
2) What do we want?
    - Distribution of % time outside limits: Done
    - Duration of time outside limits: per patient, distribution, median, average
    - Number of times outside limits per patient
    - Correlation between time elapsed and likelihood of being outside of limits: t vs. at time t, what fraction of patients are outside limits?
    - time spent outside autoregulation vs time to autoregulation calc

#### Percent Time In and Out

In [None]:
# Per-recording, duration-weighted mean of the in_out flag.
#   true_false: HDF5 key -> [weighted in_out fraction]
#   ins:        raw in_out Series of every recording (pooled later for summary stats)
true_false = {}
ins = []
mode = 'mean'
with h5py.File(global_path, "r") as f:
    for pt in f:
        in_out_group = f[f"{pt}/processed/in_out_{mode}"]
        # recordings flagged as having no label overlap carry nothing to summarise
        if in_out_group.attrs["no_label_overlap"]:
            continue
        in_out_df = pd.Series(in_out_group["in_out"][...])
        idx_window = pd.DataFrame(in_out_group["window_idx"][...])

        # windows are not all the same length, so weigh each window's flag by its
        # length (second column minus first column of window_idx) instead of
        # averaging the flags equally
        len_window = idx_window.iloc[:, 1] - idx_window.iloc[:, 0]
        total_len = len_window.sum()
        if total_len == 0:
            # check BEFORE dividing: the fraction is undefined here, so record NaN
            # explicitly rather than evaluating 0 / 0
            print(f"{pt}: total window length is 0, in/out fraction is undefined (n_windows={len(len_window)})")
            in_out = np.nan
        else:
            in_out = (in_out_df * len_window).sum() / total_len

        true_false[pt] = [in_out]
        ins.append(in_out_df)


In [None]:
# one row per recording (index = HDF5 key), single column 0 = weighted in_out fraction
true_false_df = pd.DataFrame.from_dict(true_false, orient="index")
# pool every recording's raw in_out flags into one long Series and summarise it
bool_in_out_df = pd.concat(ins)
bool_in_out_df.describe()

In [None]:
# Fraction of time outside limits per recording = 1 - weighted in_out fraction (column 0).
# Select the column explicitly instead of squeeze(): squeeze() collapses a
# single-recording frame to a 0-d array, which breaks len(percent_time) in the plot below.
percent_time = 1 - true_false_df.iloc[:, 0].to_numpy()
print(percent_time.shape)

In [None]:
# plot distribution of the percentage of time outside limits (one value per recording)
with plt.rc_context({'xtick.bottom': True, 'ytick.left': True}):
    fig, ax = plt.subplots(figsize=(12,6))
    fig.tight_layout(pad=2)
    sns.histplot(percent_time * 100, ax=ax, stat="probability", edgecolor=(0, 0, 0, 0.5), alpha=0.5)
    ax.set_title(f"Percentage of time spent outside autoregulatory limits ({len(percent_time)} files, 189 patients)")
    ax.set_xlabel('Percentage of Time Outside Limits')
    # stat="probability" normalises bar heights to sum to 1, so this is not a density
    ax.set_ylabel("Probability")
    ax.set_ylim(0, 0.25)
    # the x values are percentages (0-100), so place minor ticks every 10 points
    ax.set_xticks(np.arange(0, 101, 10), minor=True)

    # pandas mean/median skip NaN entries
    mean_value = pd.Series(percent_time).mean()
    ax.axvline(x=mean_value * 100, color='red', linestyle='--', label=f'Mean: {mean_value * 100:.0f}%')

    med_value = pd.Series(percent_time).median()
    ax.axvline(x=med_value * 100, color='green', linestyle='--', label=f'Median: {med_value * 100:.0f}%')
    # build the legend once, after both reference lines exist
    ax.legend()

    img_name = "in_out_percent_distribution.png"
    fig.savefig(os.path.join(img_dir, img_name), bbox_inches='tight')
    plt.show()

Now, we can correlate this with the time required to first calculate MAPopt.

In [None]:
# For every recording that has label overlap, look up the time elapsed until the
# optimal-MAP targets could be calculated (via find_time_elapsed on the label files).
calc = TARGETS
e = []
with h5py.File(global_path, "r") as f:
    for pt in f:
        if f[f"{pt}/processed/in_out_{mode}"].attrs["no_label_overlap"]:
            continue
        # the part of the key before the first underscore is what find_time_elapsed receives
        patient_id = str(pt).split("_")[0]
        e.append([find_time_elapsed(patient_id, calc, labels_path), str(pt)])

# column 0 = elapsed time, column 1 = HDF5 key (becomes the index below)
elapsed_col = pd.Series([t[0] for t in e])
key_col = pd.Series([str(t[1]) for t in e])
elapsed_times = pd.concat([elapsed_col, key_col], axis=1)
# index by key and drop recordings for which no elapsed time was found
elapsed_times_idx = elapsed_times.set_index(1).dropna()
print(elapsed_times_idx.describe())

In [None]:
# Inner-join the weighted in_out fraction with the elapsed time on the recording key.
# Both frames have a column named 0, so pandas suffixes them:
#   "0_x" = weighted in_out fraction, "0_y" = elapsed time
merged = true_false_df.merge(elapsed_times_idx, left_index=True, right_index=True)
# there is a single outlier that takes almost 80 hours to compute; keep rows below the cutoff
below_cutoff = merged["0_y"] < 60000
timevpercent = merged.loc[below_cutoff]

In [None]:
# Pearson correlation between the weighted in_out fraction ("0_x") and the natural
# log of the elapsed time ("0_y"); r is the coefficient, p the two-sided p-value
r, p = pearsonr(timevpercent["0_x"], np.log(timevpercent["0_y"]))

In [None]:
# plot correlation between percent time outside limits and log time to first MAPopt
with plt.rc_context({'xtick.bottom': True, 'ytick.left': True}):
    fig, ax = plt.subplots(figsize=(12,6))
    fig.tight_layout(pad=0)

    # "0_x" is the weighted in_out fraction; the percent-time cell above takes
    # 1 - that fraction as "outside limits", so do the same here to match the
    # axis label and title
    pct_outside = (1 - timevpercent["0_x"]) * 100
    # "0_y" is divided by 60*60 before the log, matching the "log hr" axis label
    log_hours = np.log(timevpercent["0_y"] / (60 * 60))
    # compute r on exactly the series being plotted so the annotation cannot
    # disagree with the figure
    r_outside, _ = pearsonr(pct_outside, log_hours)

    sns.regplot(x=pct_outside, y=log_hours, scatter_kws={'alpha': 0.6}, ax=ax)
    ax.set_title(f"Percentage of Time Outside Autoregulation vs. Time to First MAPopt calculation\n({timevpercent.shape[0]} files, 189 patients)")
    ax.set_xlabel('Percentage of Time Outside Limits')
    ax.set_ylabel("Log Time to MAPopt calculation \n(log hr)")
    ax.text(0.05, 0.95, f"r = {r_outside:.2f}", transform=ax.transAxes, fontsize=12)

    plt.show()

Notice there is no correlation, most likely because the time to MAPopt calculation depends on a time period for which there are no autoregulatory limits.

We can also see how the likelihood of being outside the limits is correlated with time elapsed from start of recording.

In [None]:
# get in_out for each patient, add n/a if time index is not represented
# concat and plot
frames = []
with h5py.File(global_path, "r") as f:
    for pt in tqdm(f.keys()):
        processed_data_path = f"{pt}/processed/in_out_{mode}/"
        # skip recordings flagged as having no label overlap
        if f[processed_data_path].attrs["no_label_overlap"]:
            continue
        label_datetimes = f[processed_data_path + "label_timestamp"][...]
        # shift timestamps so they are relative to the recording's start time.
        # NOTE(review): the 1e6 factor presumably converts a start time in seconds to
        # microseconds to match label_timestamp -- confirm the units of both fields.
        # The multiplication yields floats; they are cast back to int64 below.
        zero_idx = label_datetimes - (f[f"{pt}/raw"].attrs["dataStartTimeUnix"]).astype(np.int64) * 1e6
        in_out_df = f[f"{pt}/processed/in_out_{mode}/in_out"][...]

        # one two-column frame per recording; the in_out column is named after the
        # recording key so columns from different recordings stay separate
        df = pd.DataFrame({
            "timestamp": zero_idx.astype(np.int64),
            f"in_out_{pt}": in_out_df
        })
        frames.append(df)

# row-wise concat: each row has a value only in its own recording's in_out column,
# every other in_out column is NaN for that row
continuous_time_grid = pd.concat(frames, axis=0)
print(continuous_time_grid.shape)
# collapse rows that share a timestamp; first() keeps the first non-null entry per
# column, producing one row per timestamp with one column per recording
grid_groups = continuous_time_grid.groupby("timestamp", as_index=False)
continuous_time_grid = grid_groups.first()
        

In [None]:
# inspect the zero-based timestamp column of the merged grid
continuous_time_grid['timestamp']

In [None]:
# pair each timestamp with the row-wise total of the per-recording in_out columns
# (sum skips NaN by default, so only recordings present at that timestamp contribute)
in_out_columns = continuous_time_grid.drop(columns='timestamp')
to_graph = continuous_time_grid[['timestamp']].assign(sum=in_out_columns.sum(axis=1))

In [None]:
# inspect the per-timestamp totals of the in_out columns
to_graph['sum']

In [None]:
# 2-D histogram of the summed in_out flags against time since recording start
# (work in progress: the raw sum is not yet normalised by how many recordings
# are present at each timestamp)
with plt.rc_context({'xtick.bottom': True, 'ytick.left': True}):
    fig, ax = plt.subplots(figsize=(12,6))
    fig.tight_layout(pad=0)
    sns.histplot(x=to_graph['timestamp'], y=to_graph['sum'], stat="probability", ax=ax)
    # every column of the grid except "timestamp" is one recording's in_out series
    n_files = continuous_time_grid.shape[1] - 1
    ax.set_title(f"Summed in_out Flags vs. Time Elapsed from Start of Recording\n({n_files} files)")
    ax.set_xlabel('Time Elapsed from Start of Recording (timestamp units)')
    ax.set_ylabel("Sum of in_out flags across recordings")
    plt.show()

In [None]:
# debugging peek: zero_idx leaks out of the loop above, so this shows the shifted
# timestamps of the last recording that was processed
zero_idx

### Amount of time spent outside of limits

We can do this in one-minute intervals: develop a function that builds a list of the uninterrupted segments spent outside the limits, together with their lengths.