In [None]:
import os
import pandas as pd

In [None]:
# Read the two echo lab extracts (legacy and Epic) from disk
legacy = pd.read_csv(os.path.expanduser('~/dropbox/ecgnet-as/data/mgh-echo-lab/legacy.csv'))
epic = pd.read_csv(os.path.expanduser('~/dropbox/ecgnet-as/data/mgh-echo-lab/epic.csv'))

# Parse the date column, then discard rows missing an identifying field.
# Legacy rows are only required to have mrn and date; Epic rows must also
# carry an OrderProcedureID, which later cells use as the dedupe key.
legacy = (
    legacy
    .assign(date=pd.to_datetime(legacy["date"]))
    .dropna(subset=["mrn", "date"])
)
epic = (
    epic
    .assign(date=pd.to_datetime(epic["date"]))
    .dropna(subset=["mrn", "date", "OrderProcedureID"])
)

In [None]:
# Read the formatted EDW pull and narrow it to the identifier, measurement,
# and provenance columns used downstream (column order is preserved as listed)
edw = pd.read_csv(os.path.expanduser('~/dropbox/ecgnet-as/data/edw/data.csv'))
edw = edw.loc[:, [
    'mrn', 'date',
    'AV Peak Gradient', 'AV Mean Gradient', 'AV Peak Velocity', 'AV Area',
    'source', 'OrderProcedureID',
]]

# Parse dates, then drop rows that cannot be keyed on mrn/date/OrderProcedureID
edw = (
    edw
    .assign(date=pd.to_datetime(edw["date"]))
    .dropna(subset=["mrn", "date", "OrderProcedureID"])
)

In [None]:
# Within each source, collapse rows sharing (mrn, OrderProcedureID) to a single row.
# Sorting by date first means keep='last' retains the row with the latest date.
epic = epic.sort_values(by='date').drop_duplicates(subset=['mrn', 'OrderProcedureID'], keep='last')
edw = edw.sort_values(by='date').drop_duplicates(subset=['mrn', 'OrderProcedureID'], keep='last')

# Stack the echo lab Epic rows on top of the EDW rows with a fresh 0..n-1 index,
# then order the combined frame by its 'source' column for further processing
both = pd.concat([epic, edw], ignore_index=True)
both = both.sort_values(by='source')

In [None]:
# True for every row whose (mrn, OrderProcedureID) pair occurs more than once
# in the combined frame (keep=False marks all members of a repeated group)
dupe_mask = both.duplicated(subset=['mrn', 'OrderProcedureID'], keep=False)

# Echos whose OrderProcedureID appears exactly once across the combined data
truly_unique = both.loc[~dupe_mask]

# Echos that appear more than once, ordered by source so that keep='first'
# below favours whichever source label sorts first
dupes = both.loc[dupe_mask].sort_values(by='source')

# Collapse repeated echos that agree on OrderProcedureID and on the Gradient/Area
# measurements, keeping the first row per group. The intent is to keep the EDW
# version because its Velocity values are in m/s; this relies on the EDW label
# sorting ahead of the other label in 'source'.
# Rows that share (mrn, OrderProcedureID) but differ in any of the listed
# measurements are all retained.
deduped = dupes.drop_duplicates(
    subset=['mrn', 'OrderProcedureID', 'AV Peak Gradient', 'AV Mean Gradient', 'AV Area'],
    keep='first',
)

In [None]:
# Per-source tally of echos that the other source did not contain
truly_unique.loc[:, 'source'].value_counts()

In [None]:
# Stack the single-occurrence echos, the cross-source deduped echos, and the
# legacy echos into one frame with a fresh 0..n-1 index
merged = pd.concat([truly_unique, deduped, legacy], ignore_index=True)

In [None]:
# Write the merged echo table to CSV, omitting the index column
merged.to_csv(
    path_or_buf=os.path.expanduser('~/dropbox/ecgnet-as/data/echo.csv'),
    index=False,
)