In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Read the two raw echo-lab exports.
# low_memory=False turns off pandas' chunked parsing, so each column's dtype
# is inferred from the whole file at once.
raw_legacy_path = os.path.expanduser('~/dropbox/ecgnet-as/data/mgh-echo-lab/raw-1.csv')
raw_epic_path = os.path.expanduser('~/dropbox/ecgnet-as/data/mgh-echo-lab/raw-2.csv')

legacy = pd.read_csv(raw_legacy_path, low_memory=False)
epic = pd.read_csv(raw_epic_path, low_memory=False)

In [None]:
# Keep only the rows whose DataSource matches the frame they were loaded into.
# .copy() gives each filtered frame its own data: later cells add and overwrite
# columns on these frames, and doing that on a bare slice of the raw read can
# raise SettingWithCopyWarning while the raw frame is still referenced
# (e.g. by a notebook Out[] entry).
legacy = legacy[legacy['DataSource'] == 'Legacy'].copy()
epic = epic[epic['DataSource'] == 'Epic'].copy()

In [None]:
# Every Epic echo must carry a ResultDTS value; stop here if any is missing
assert epic['ResultDTS'].notna().all()

In [None]:
# Give the legacy frame an all-NaN OrderID placeholder column
legacy = legacy.assign(OrderID=np.nan)

In [None]:
# Rename columns and reduce both frames to the same ordered column set.

# Raw column name -> output column name, shared by both sources
column_mapping = dict(
    MGH_MRN='mrn',
    AV_VPeakGradient='AV Peak Gradient',
    AV_MeanGradient='AV Mean Gradient',
    AV_PeakVelocity='AV Peak Velocity',
    AV_Area='AV Area',
    DataSource='source',
    OrderID='OrderProcedureID',
)

# The output `date` column is taken from a different raw column per source
legacy_mapping = {**column_mapping, "EventDate": "date"}
epic_mapping = {**column_mapping, "ResultDTS": "date"}

# Both mappings yield the same output names; this list fixes the column order
columns = [*epic_mapping.values()]

legacy = legacy.rename(columns=legacy_mapping).loc[:, columns]
epic = epic.rename(columns=epic_mapping).loc[:, columns]

In [None]:
# Normalise column dtypes.

# Cast order IDs to an explicit 64-bit integer. Plain `int` resolves to the
# platform's default integer, which can be 32-bit (e.g. Windows with NumPy 1.x)
# and too narrow for large IDs.
# NOTE(review): for a numeric column this cast raises if any ID is NaN, so it
# also acts as a completeness check — confirm that is intended.
epic['OrderProcedureID'] = epic['OrderProcedureID'].astype('int64')

# Parse the date columns into pandas datetimes
legacy['date'] = pd.to_datetime(legacy['date'])
epic['date'] = pd.to_datetime(epic['date'])

In [None]:
# De-duplicate echos after ordering each frame by date (ascending).

# Legacy: drop rows that are identical in every column
legacy = legacy.sort_values('date').drop_duplicates()

# Epic: rows that agree on patient, order and all four AV measurements are
# treated as one echo; with the ascending sort, keep='last' retains the row
# with the latest date.
echo_key = ['mrn', 'OrderProcedureID', 'AV Peak Gradient', 'AV Mean Gradient', 'AV Peak Velocity', 'AV Area']
epic = epic.sort_values('date').drop_duplicates(subset=echo_key, keep='last')

In [None]:
# Each (mrn, OrderProcedureID) pair must appear only once in the Epic frame;
# a repeated pair would mean one order carries differing measurement values
assert not epic.duplicated(['mrn', 'OrderProcedureID']).any()

In [None]:
# Swap every missing value for an empty string so no NaN is written out
legacy = legacy.fillna('')
epic = epic.fillna('')

In [None]:
# Write each source's frame to its own CSV, leaving out the row index
legacy_out_path = os.path.expanduser('~/dropbox/ecgnet-as/data/mgh-echo-lab/legacy.csv')
epic_out_path = os.path.expanduser('~/dropbox/ecgnet-as/data/mgh-echo-lab/epic.csv')

legacy.to_csv(legacy_out_path, index=False)
epic.to_csv(epic_out_path, index=False)