In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def _read_echo_csv(csv_path):
    """Read one CSV (the path may start with ``~``) with ``low_memory=False``."""
    return pd.read_csv(os.path.expanduser(csv_path), low_memory=False)


# One frame per source file, plus the table stored in echo.csv.
edw = _read_echo_csv('~/dropbox/ecgnet-as/data/edw/edw.csv')
epic = _read_echo_csv('~/dropbox/ecgnet-as/data/mgh-echo-lab/epic.csv')
legacy = _read_echo_csv('~/dropbox/ecgnet-as/data/mgh-echo-lab/legacy.csv')
merged = _read_echo_csv('~/dropbox/ecgnet-as/data/echo.csv')

In [None]:
# Parse the 'date' column of every echo table into datetimes (each frame is updated in place).
for echo_table in (edw, epic, legacy, merged):
    echo_table['date'] = pd.to_datetime(echo_table['date'])

In [None]:
# Give every legacy row a distinct negative OrderProcedureID: -1, -2, ..., -len(legacy).
# NOTE(review): presumably chosen so legacy rows never collide with real order IDs — confirm.
synthetic_ids = range(-1, -(len(legacy) + 1), -1)
legacy['OrderProcedureID'] = pd.Series(list(synthetic_ids))

# Stack the three sources, then keep only rows whose (mrn, OrderProcedureID) pair
# occurs exactly once; keep=False discards every member of a duplicated group.
naive_merge = pd.concat([edw, epic, legacy])
truly_unique = naive_merge.drop_duplicates(subset=['mrn', 'OrderProcedureID'], keep=False)

# Per-source tally of the surviving rows, with a placeholder entry for the merged table.
truly_unique_counts = truly_unique['source'].value_counts()
truly_unique_counts['Merged'] = 'NA'

In [None]:
def stats(df, name):
    """Print a short text summary of one echo table.

    The summary lists the number of distinct ``mrn`` values, the number of
    rows, the count of echos unique to this source, and the earliest and
    latest value in the ``date`` column.

    Args:
        df: DataFrame with ``mrn`` and ``date`` columns.
        name: Label printed in the header; also the key looked up in the
            notebook-level ``truly_unique_counts`` Series.

    Returns:
        None. The summary is written to stdout.
    """
    # value_counts() omits labels that never occur, so a source with no unique
    # rows has no entry; report 0 for it instead of raising KeyError.
    unique_to_source = truly_unique_counts.get(name, 0)
    patient_count = len(df['mrn'].drop_duplicates())
    print(
        f"--- {name} ---\n"
        f"Patients:\t\t{patient_count}\n"
        f"Echos:\t\t\t{len(df)}\n"
        f"Echos unique to {name}:\t{unique_to_source}\n"
        f"First date:\t\t{df['date'].min()}\n"
        f"Last date:\t\t{df['date'].max()}\n"
    )

In [None]:
# Print the summary for each source table and then for the merged table.
tables_by_name = {"EDW": edw, "Epic": epic, "Legacy": legacy, "Merged": merged}
for name, df in tables_by_name.items():
    stats(df, name)

In [None]:
# Read the ECG listing and name its two columns.
ecg_raw = pd.read_csv(os.path.expanduser('~/explore-ecg/tensors_union.csv'))
ecg_raw.columns = ["mrn", "date"]

# Drop rows with any missing value, then coerce mrn to int and date to datetime.
ecg = ecg_raw.dropna().assign(
    mrn=lambda frame: frame["mrn"].astype(int),
    date=lambda frame: pd.to_datetime(frame["date"]),
)

# Keep only ECGs dated within the window below (both endpoints included).
window_start = pd.to_datetime('1981-06-01')
window_end = pd.to_datetime('2022-01-01')
ecg = ecg[ecg["date"].between(window_start, window_end)]

In [None]:
def plot_test_count(df, title, xlabel, ylabel, save_path):
    """Draw a bar chart of the number of tests per calendar month and save it.

    Rows are grouped by the month of ``df["date"]`` and the non-null ``mrn``
    values in each month are counted. The counts are then laid out on a
    contiguous month axis: months between the first and last observed month
    that have no rows are drawn as zero-height bars, so every bar position
    corresponds to exactly one calendar month and the yearly tick marks stay
    aligned with January.

    Args:
        df: DataFrame with an ``mrn`` column and a datetime ``date`` column.
        title: Figure title.
        xlabel: Label for the x-axis.
        ylabel: Label for the y-axis.
        save_path: Destination passed to ``plt.savefig``.

    Returns:
        None. The figure is left open for the notebook to render.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    # Count tests per month; groupby drops rows whose date is missing.
    month_counts = df["mrn"].groupby(df["date"].dt.to_period("M")).count()
    if len(month_counts) > 0:
        # Fill gaps so bar index i is always i months after the first month.
        every_month = pd.period_range(
            month_counts.index.min(), month_counts.index.max(), freq="M"
        )
        month_counts = month_counts.reindex(every_month, fill_value=0)

    ax.bar(range(len(month_counts)), month_counts.to_numpy(), width=1)
    ax.tick_params(axis='y', which='major', labelsize=12)
    ax.tick_params(axis='x', which='major', labelsize=12)
    ax.set_ylabel(ylabel, fontsize=14)
    ax.set_xlabel(xlabel, fontsize=14)
    ax.set_title(title, fontsize=16)

    # One tick per year, starting at the first January (or at the first bar
    # when the data contains no January at all).
    months = list(month_counts.index)
    first_jan = next((i for i, month in enumerate(months) if month.month == 1), 0)
    tick_positions = list(range(first_jan, len(months), 12))
    tick_labels = [f"{months[i].year}-{months[i].month:02}" for i in tick_positions]
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, rotation=90)

    plt.tight_layout()
    plt.savefig(save_path)

In [None]:
# Monthly ECG volume, written to the figures-and-tables folder.
ecg_plot_kwargs = {
    "df": ecg,
    "title": f'ECGs per Month: n={len(ecg)}',
    "xlabel": 'ECG date',
    "ylabel": 'Number of ECGs',
    "save_path": os.path.expanduser('~/dropbox/ecgnet-as/figures-and-tables/ecgs-per-month.pdf'),
}
plot_test_count(**ecg_plot_kwargs)

In [None]:
# Monthly echo volume from the merged table, written to the figures-and-tables folder.
echo_plot_kwargs = {
    "df": merged,
    "title": f'Echos per Month: n={len(merged)}',
    "xlabel": 'Echo date',
    "ylabel": 'Number of Echos',
    "save_path": os.path.expanduser('~/dropbox/ecgnet-as/figures-and-tables/echos-per-month.pdf'),
}
plot_test_count(**echo_plot_kwargs)

In [None]:
# Report how many rows of the merged echo table have a non-missing value for
# each aortic-valve measurement. The table loaded from echo.csv is bound to
# `merged`; no `echo` variable is defined anywhere in this notebook.
for key in ["AV Area", "AV Peak Velocity", "AV Mean Gradient", "AV Peak Gradient"]:
    # count() tallies non-null entries and yields 0 for an all-missing column,
    # whereas isna().value_counts()[False] raises KeyError in that case.
    print(f"{key}: {merged[key].count()}")

In [None]:
# Histogram of the EDW AV peak velocity column, saved as a PDF.
edw_velocity_ax = edw['AV Peak Velocity'].hist(bins=40, range=(-0.5, 7.5))
edw_velocity_ax.set_title('EDW: AV Peak Velocity (converted to m/s)')
edw_velocity_ax.set_ylabel('Count')
edw_velocity_ax.set_xlim(-0.5, 8)
edw_velocity_ax.get_figure().savefig(
    os.path.expanduser('~/dropbox/ecgnet-as/figures-and-tables/edw-peak-velocity.pdf')
)

In [None]:
# Histogram of the Epic AV peak velocity column, saved as a PDF.
epic_velocity_ax = epic['AV Peak Velocity'].hist(bins=40)
epic_velocity_ax.set_title('Echo Lab (Epic): AV Peak Velocity (unknown units)')
epic_velocity_ax.set_ylabel('Count')
epic_velocity_ax.set_xlim(-500, 8000)
epic_velocity_ax.get_figure().savefig(
    os.path.expanduser('~/dropbox/ecgnet-as/figures-and-tables/epic-peak-velocity.pdf')
)

In [None]:
# Summary statistics for the AV mean gradient column of the merged table.
merged.loc[:, 'AV Mean Gradient'].describe()

In [None]:
# Histogram of every AV mean gradient value; the title shows the non-missing count.
gradient_values = merged['AV Mean Gradient']
gradient_ax = gradient_values.hist(bins=40)
gradient_ax.set_title(f'AV Mean Gradient: n={len(gradient_values.dropna())}')
gradient_ax.set_ylabel('Count')
gradient_ax.get_figure().savefig(
    os.path.expanduser('~/dropbox/ecgnet-as/figures-and-tables/av-mean-gradient.pdf')
)

In [None]:
# Same column, with the histogram bins restricted to the 30-40 range.
zoomed_gradient_ax = merged['AV Mean Gradient'].hist(bins=40, range=(30, 40))
zoomed_gradient_ax.set_ylabel('Count')
zoomed_gradient_ax.set_title('AV Mean Gradient (30 - 40 mmHg)')
zoomed_gradient_ax.get_figure().savefig(
    os.path.expanduser('~/dropbox/ecgnet-as/figures-and-tables/av-mean-gradient-zoomed-in.pdf')
)