In [None]:
import os
import pandas as pd
from IPython.display import clear_output

In [None]:
def p(d, max_rows=None, max_cols=None):
    """Display ``d`` with temporarily relaxed pandas display options.

    Args:
        d: Object handed to ``display`` (typically a DataFrame or Series).
        max_rows: Value for ``display.max_rows``; ``None`` removes the limit.
        max_cols: Value for ``display.max_columns``; ``None`` removes the limit.

    Floats are rendered through a plain f-string and column width is not
    truncated. All options are restored when the ``with`` block exits.
    """
    overrides = (
        'display.max_rows', max_rows,
        'display.max_columns', max_cols,
        'display.float_format', lambda x: f'{x}',
        'display.max_colwidth', None,
    )
    with pd.option_context(*overrides):
        display(d)

In [None]:
# Load the raw echo export. low_memory=False makes pandas read the file in one
# pass instead of chunk-by-chunk type inference.
df = pd.read_csv(
    os.path.expanduser('~/dropbox/ecgnet-as/data/edw/raw-edw-echo.csv'),
    low_memory=False,
)

In [None]:
# Column Names
# df.keys()

In [None]:
# Setup to get relevant measurement names.
# Standard names for the aortic-valve measurements this notebook keeps.
peak_velocity_name = "AV Peak Velocity"
# velocity_ratio_name = "AV Velocity Ratio"
peak_gradient_name = "AV Peak Gradient"
mean_gradient_name = "AV Mean Gradient"
valve_area_name = "AV Area"
indexed_valve_area_name = "Indexed AV Area"

# Unique raw component names, then three case-insensitive substring filters:
#   meas_mask    - looks like a velocity / gradient / area measurement
#   av_mask      - mentions "av" or "ao"
#   exclude_mask - mentions something we do not want (other valves, LVOT, ...)
# na=False: a missing ComponentNM would otherwise put NaN into the masks, which
# breaks `~exclude_mask` and boolean indexing.
components = df['ComponentNM'].drop_duplicates().reset_index(drop=True)
components_lower = components.str.lower()
meas_mask = components_lower.str.contains('peak|grad|vel|mean|area|max', na=False)
av_mask = components_lower.str.contains('av|ao', na=False)
exclude_mask = components_lower.str.contains(
    'prosthetic|stress|mitral|pulmonic|tricuspid|outflow|lvot', na=False
)
to_map = components[meas_mask & av_mask & ~exclude_mask].reset_index(drop=True)

# Menu for the interactive mapping step: number typed by the user -> standard name.
n_to_name = {
    1: peak_velocity_name,
    2: peak_gradient_name,
    3: mean_gradient_name,
    4: valve_area_name,
    5: indexed_valve_area_name,
#     6: velocity_ratio_name,
    0: "Discard",
}
# Prompt text, one "<number>: <name>" line per option with a trailing newline.
# Built directly from the items rather than by slicing str(dict), so names that
# contain quotes or ", " cannot corrupt the menu.
n_to_name_str = "\n".join(f"{number}: {name}" for number, name in n_to_name.items()) + "\n"

def valid_option(x):
    """Return True when ``x`` parses as an integer that is a key of ``n_to_name``.

    Args:
        x: Raw user input (a string from ``input``).

    Returns:
        bool: False if ``int(x)`` raises ``ValueError`` or the number is not a
        menu option; True otherwise.
    """
    try:
        choice = int(x)
    except ValueError:
        return False
    return choice in n_to_name

# Preview the component names selected for manual mapping:
# p(to_map)

In [None]:
# Make sure that we're not dropping anything
# p(components[~(meas_mask & av_mask)])

In [None]:
# Manually map measurement names (skip if existing mapping is OK)
# For every candidate component the user types one of the menu numbers; the
# prompt repeats until the answer is a valid option.
name_map = {}
total = len(to_map)
for position, component in enumerate(to_map, start=1):
    print(f"{position} / {total}")
    print(component)
    print()
    choice = input(n_to_name_str)
    while not valid_option(choice):
        print("Please choose a valid category")
        choice = input(n_to_name_str)
    clear_output(wait=True)
    name_map[component] = n_to_name[int(choice)]

# Components the user marked as "Discard" are dropped from the mapping.
name_map = {raw: standard for raw, standard in name_map.items() if standard != "Discard"}

In [None]:
# Curated mapping: raw ComponentNM value -> standard measurement name.
# Written grouped by standard name and inverted below; commented-out entries
# are candidates that are currently left out of the mapping.
name_map = {
    raw_name: standard_name
    for standard_name, raw_names in {
        'AV Area': [
            # 'AV AREA-PISA',
            # 'AV COMP AREA',
            # 'AV INCOMP AREA',
            'AV VALVE AREA',
            'AORTIC VALVE AREA (1)',
            'AORTIC VALVE AREA (2)',
            # 'AORTIC VALVE ANNULUS AREA (1)',
        ],
        'AV Mean Gradient': [
            'AV MEAN GRADIENT',
            'AORTIC VALVE MEAN GRADIENT (1)',
            'AORTIC VALVE MEAN GRADIENT (2)',
        ],
        'AV Peak Gradient': [
            'AV PEAK GRADIENT',
            'AORTIC VALVE PEAK GRADIENT (1)',
            'AORTIC VALVE PEAK GRADIENT (2)',
        ],
        'AV Peak Velocity': [
            # 'DOP CALC AO PEAK VEL',
            'AV PEAK VELOCITY',
            'AORTIC VALVE PEAK VELOCITY (1)',
            'AORTIC VALVE PEAK VELOCITY (2)',
            # 'AORTIC VALVE PEAK DIASTOLIC VELOCITY (1)',
        ],
        'Indexed AV Area': [
            'AORTIC VALVE AREA INDEX (1)',
            'AORTIC VALVE AREA INDEX (2)',
        ],
        # 'AV Velocity Ratio': [
        #     'AV VELOCITY RATIO',
        # ],
    }.items()
    for raw_name in raw_names
}

In [None]:
# Only keep measurements whose raw component name is defined in name_map.
# isin/map are vectorized replacements for the per-row lambdas and give the
# same result, since every surviving ComponentNM is a key of name_map.
as_df = df[df['ComponentNM'].isin(list(name_map))].copy()

# Label measurements with their standard name and add friendlier column names.
as_df['measurement'] = as_df['ComponentNM'].map(name_map)
as_df['value'] = as_df['ResultValueNBR']
as_df['units'] = as_df['ReferenceRangeUnitCD']

# Remove exact duplicate measurements (same patient, same echo, same
# measurement, same value), then remove measurements with NaN results.
as_df = (
    as_df
    .drop_duplicates(['MRN', 'OrderProcedureID', 'measurement', 'value'])
    .dropna(subset=['value'])
)

In [None]:
# For a given echo there should be only one result per measurement (exact
# duplicates were already removed). Show the first offending measurement's
# rows and stop looking; nothing is printed when every measurement is clean.
for name, group in as_df.groupby('measurement'):
    dupes = group.loc[group.duplicated('OrderProcedureID', keep=False)]
    if len(dupes) == 0:
        continue
    print(f"Duplicate measurements for {name} found, please fix before proceding:")
    p(dupes.sort_values('MRN'), max_rows=20)
    break

In [None]:
# Reshape from one row per measurement per echo to one row per echo, with one
# column per measurement value and one per measurement unit.
non_pivot_columns = [
    'MRN',
    'PatientID',
    'PatientEncounterID',
    'ProcedureID',
    'ProcedureDSC',
    'OrderProcedureID',
    'OrderDTS',
    'StartDTS',
    'EndDTS',
    'ResultDTS',
    'OrderTypeDSC',
    'OrderDisplayNM',
    'ComponentObservedDTS',
    'SpecimenReceivedTimeDTS',
    'SpecimenTakenTimeDTS',
]
pivot_columns = ['measurement']
pivot_values = ['value', 'units']

data = as_df.pivot(index=non_pivot_columns, columns=pivot_columns, values=pivot_values)
# Flatten the (field, measurement) column pairs: value columns are named after
# the measurement itself, every other field becomes "<measurement> <field>".
data.columns = [
    measurement if field == "value" else f"{measurement} {field}"
    for field, measurement in data.columns
]
data = data.reset_index()
data

In [None]:
# Cohort summary: distinct patients, distinct echos, and distinct echos that
# have peak velocity, mean gradient and valve area all present.
n_patients = data['MRN'].drop_duplicates().shape[0]
n_echos = data[['MRN', 'OrderProcedureID']].drop_duplicates().shape[0]
n_as = (
    data
    .dropna(subset=[peak_velocity_name, mean_gradient_name, valve_area_name])
    [['MRN', 'OrderProcedureID']]
    .drop_duplicates()
    .shape[0]
)

print(
    f"Number of Patients:\t\t{n_patients}",
    f"Number of Echos:\t\t{n_echos}",
    f"Number of Echos w/ AS values:\t{n_as}",
    sep="\n",
)

In [None]:
# Write the echo-by-measurement table next to the raw export, without the
# DataFrame index column.
data.to_csv(
    os.path.expanduser('~/dropbox/ecgnet-as/data/edw/edw-echo.csv'),
    index=False,
)