In [None]:
import os
import numpy as np
from tqdm import tqdm
# import pandas as pd
import modin.pandas as pd

! python -m pip install "dask[dataframe]"
import dask.dataframe as dd

## Load metadata from HD5 ECGs

In [None]:
# Read the CSV that describes the existing HD5 ECG dataset.
fpaths_mrns_old = os.path.expanduser(
    "~/dropbox/ecg/explore/mgh/tensors_all_union.csv"
)
df_old = pd.read_csv(fpaths_mrns_old)
print(f"Loaded {fpaths_mrns_old} into df")

## Load new VM metadata (MRNs, ECG datetime, location)

In [None]:
# Load the merged new-VM metadata CSV. If it does not exist yet, create it by
# merging the MRN list with the locations list, then cache the result on disk
# so later runs take the fast path.
fpath_mrns_new      = os.path.expanduser("~/dropbox/ecg/new-vms/mgh-updated.csv")
fpath_mrns_new_locs = os.path.expanduser("~/dropbox/ecg/new-vms/mgh-updated-locations.csv")
fpath_mrns_updated  = os.path.expanduser("~/dropbox/ecg/new-vms/mgh-updated-merged.csv")

# Column names shared by both new-VM CSVs; also used as the merge keys below.
ecg_datetime_key = 'ECG_datetime'
ecg_mrn_key = 'PatientID'

if os.path.exists(fpath_mrns_updated):
    # Cached merge exists: read it back directly.
    df_new = pd.read_csv(fpath_mrns_updated)

    # Convert column to datetime format
    df_new[ecg_datetime_key] = pd.to_datetime(df_new[ecg_datetime_key])

    print(f"Loaded {fpath_mrns_updated} into df with keys {df_new.keys()}")
else:
    # Read both inputs lazily with Dask. Note that len() on a Dask index forces
    # a pass over the file, so each of these prints reads its CSV once.
    df_new = dd.read_csv(fpath_mrns_new)
    print(f"Read {fpath_mrns_new} to dask dataframe with {len(df_new.index)} rows")

    df_new_locs = dd.read_csv(fpath_mrns_new_locs)
    print(f"Read {fpath_mrns_new_locs} to dask dataframe with {len(df_new_locs.index)} rows")

    # Outer-merge the two CSVs on the shared MRN + ECG datetime keys.
    df_merged = dd.merge(df_new, df_new_locs, how='outer', on=[ecg_mrn_key, ecg_datetime_key])

    # Materialize the merge exactly once (overwriting df_new) and report the
    # length from the result, instead of evaluating the merge graph a second
    # time just to count rows.
    df_new = df_merged.compute()
    print(f'Merged two Dask dfs into one df with length {len(df_new.index)}')

    # Convert column to datetime format (done before writing so the cached CSV
    # stores the parsed datetimes).
    df_new[ecg_datetime_key] = pd.to_datetime(df_new[ecg_datetime_key])

    # Write the output
    df_new.to_csv(fpath_mrns_updated, index=False)
    print(f"Saved merged df to {fpath_mrns_updated}")

In [None]:
# Show the first few rows of the merged new-VM metadata as a sanity check.
df_new.head(n=5)

## Load CSV of reference cohort

In [None]:
# Reference-cohort configuration. To target the STS cohort instead, swap in:
#   fpath_ref = os.path.expanduser("~/dropbox/sts-data/mgh-all-features-labels.csv")
#   cohort_name = "sts"
#   date_key = "surgdt"
#   ref_key = "medrecn"
#   outcome_key = 'mtopd'
# NOTE(review): unlike the STS settings, the Apollo settings below do not set
# outcome_key, yet later cells read it -- confirm which label column applies.
cohort_name = "apollo"
fpath_ref = os.path.expanduser("~/dropbox/apollo/ecg_pressures_labs_processed.csv")
ref_key = "Patient_ID"
date_key = "Date_of_Cath"

# Read the cohort table and parse its date column so it can be compared with
# ECG datetimes later on.
df_ref = pd.read_csv(fpath_ref)
print(f"Loaded {fpath_ref} into df")
df_ref[date_key] = pd.to_datetime(df_ref[date_key])
df_ref

In [None]:
# Get all MRNs from reference CSV. IDs that cannot be parsed as numbers are
# coerced to NaN, and the cleaned column is written back to df_ref.
mrn_ref = pd.to_numeric(df_ref[ref_key], errors="coerce")
df_ref[ref_key] = mrn_ref

# Drop NaN before de-duplicating so unparseable IDs are not reported as a
# unique MRN (np.unique would otherwise keep NaN in its output).
mrn_ref_unique = np.unique(mrn_ref.dropna())
print(f'Reference: {len(mrn_ref)} total MRNs and {len(mrn_ref_unique)} unique MRNs')

In [None]:
# Get all MRNs in tensors_all. IDs that cannot be parsed as numbers are coerced
# to NaN, and the cleaned column is written back to df_old.
mrn_old = pd.to_numeric(df_old['ecg_patientid_clean'], errors="coerce")
df_old['ecg_patientid_clean'] = mrn_old

# Drop NaN before de-duplicating so unparseable IDs are not reported as a
# unique MRN (np.unique would otherwise keep NaN in its output).
mrn_old_unique = np.unique(mrn_old.dropna())
print(f'Existing HD5 dataset: {len(mrn_old)} total MRNs and {len(mrn_old_unique)} unique MRNs')

# MRNs that appear in both the reference cohort and the existing HD5 dataset.
mrn_intersect_old_ref = np.intersect1d(mrn_ref_unique, mrn_old_unique)
print(f'Intersect between existing HD5 dataset and reference cohort: {len(mrn_intersect_old_ref)} unique MRNs found in both')

In [None]:
# Get all MRNs from new VM CSV, using the same column key as the rest of the
# notebook. IDs that cannot be parsed as numbers are coerced to NaN, and the
# cleaned column is written back to df_new.
mrn_new = pd.to_numeric(df_new[ecg_mrn_key], errors="coerce")
df_new[ecg_mrn_key] = mrn_new

# Drop NaN before de-duplicating so unparseable IDs are not reported as a
# unique MRN (np.unique would otherwise keep NaN in its output).
mrn_new_unique = np.unique(mrn_new.dropna())
print(f'CSV from new VM: {len(mrn_new)} total MRNs and {len(mrn_new_unique)} unique MRNs')

In [None]:
# MRNs that appear in both the reference cohort and the new VM CSV.
mrn_intersect_new_ref = np.intersect1d(
    mrn_ref_unique,
    mrn_new_unique,
)
print(f'Intersect between new VM CSVs and reference cohort: {mrn_intersect_new_ref.size} unique MRNs found in both')

In [None]:
# Reference-cohort MRNs found in the new VM CSV that are not already covered by
# the existing HD5 dataset.
new_mrns = [*(set(mrn_intersect_new_ref) - set(mrn_intersect_old_ref))]
print(f"Found {len(new_mrns)} new reference cohort MRNs in new VM CSV")

In [None]:
# Write the new MRNs, cast to int, as a single headerless column with one MRN
# per line.
fpaths_mrns_prioritize = os.path.expanduser(
    f"~/dropbox/ecg/new-vms/mrns-to-prioritize-{cohort_name}.csv"
)
pd.DataFrame(list(map(int, new_mrns))).to_csv(
    fpaths_mrns_prioritize,
    index=False,
    header=False,
)
print(f"Saved new reference cohort MRNs to {fpaths_mrns_prioritize}")

In [None]:
# For each new reference-cohort MRN, find the ECGs in the new VM CSV that fall
# inside the 30-day window ending at that patient's reference date, and collect
# the results into df_ecg_hits (one row per MRN).

# The active cohort settings may not define an outcome column (only the STS
# settings set outcome_key), so resolve it defensively instead of hitting a
# NameError on the first iteration.
label_key = globals().get("outcome_key")
if label_key is None:
    print("No outcome_key configured for this cohort; labels are omitted from ecg_hits")

# Length of the look-back window; loop-invariant, so built once.
lookback = pd.Timedelta(value=30, unit='days')

ecg_hits = {}

for mrn in tqdm(new_mrns):

    # Isolate DF rows for this MRN from reference cohort
    df_ref_mrn = df_ref[df_ref[ref_key] == mrn]

    # Isolate DF rows for this MRN from CSV from new VM
    df_new_ecgs = df_new[df_new[ecg_mrn_key] == mrn]

    # The window ends at the reference date in the LAST reference row for this
    # patient and starts 30 days earlier.
    # NOTE(review): "last" is file order; this is only the latest date if the
    # reference CSV is sorted chronologically -- confirm.
    end_date = df_ref_mrn[date_key].iloc[-1]
    start_date = end_date - lookback

    # Boolean mask of all ECGs strictly inside the window (both ends exclusive)
    ecg_dates = df_new_ecgs[ecg_datetime_key]
    mask = (ecg_dates > start_date) & (ecg_dates < end_date)

    hits = {}

    # Save label for this MRN, taken from the same (last) reference row as the
    # date so a patient with several reference rows does not break int().
    if label_key is not None:
        hits[label_key] = int(df_ref_mrn[label_key].iloc[-1])

    # All ECG datetimes inside the window; an empty list when there are none.
    hits['hit_dates'] = ecg_dates[mask].to_list()

    # Add info to dicts
    hits['start_date'] = start_date
    hits['end_date'] = end_date
    hits['hit_count'] = int(mask.sum())

    ecg_hits[mrn] = hits

# Outer dict keys (MRNs) become rows after the transpose.
df_ecg_hits = pd.DataFrame(ecg_hits).T

In [None]:
# Display the per-MRN table of ECG hits (one row per MRN).
df_ecg_hits

In [None]:
# Summarize the ECG hits: how many new reference MRNs there are, how many have
# at least one ECG in the window, and (when a label column is configured) how
# many positive labels those groups contain.

# Resolve the label column defensively; not every cohort config defines
# outcome_key, and the label lines are skipped when it is absent.
label_key = globals().get("outcome_key")

has_hit = df_ecg_hits['hit_count'] > 0

print(f"Reference MRNs (total new from VM): {df_ecg_hits.shape[0]}")
print(f"Reference MRNs (1+ ECG in window): {has_hit.sum()}")
print(f"Total ECGs in window: {df_ecg_hits['hit_count'].sum()}")

if label_key is not None and label_key in df_ecg_hits.columns:
    # Use the configured label column for both counts rather than a hard-coded
    # column name, so the two lines always agree with the active cohort.
    print(f"Positive labels: {df_ecg_hits[label_key].sum()}")
    print(f"Positive labels (with ECGs in window): {df_ecg_hits[has_hit][label_key].sum()}")