In [ ]:
# Removes lint errors from VS Code
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

if TYPE_CHECKING:
    # Annotation-only declarations: this branch never runs, it only tells
    # static analysis about names that are used in this notebook without
    # being defined in it.
    # NOTE(review): presumably these are injected by the Kedro notebook
    # session - confirm.
    import kedro
    catalog: kedro.io.data_catalog.DataCatalog
    session: kedro.framework.session.session.KedroSession
    pipelines: Dict[str, kedro.pipeline.pipeline.Pipeline]

import pandas as pd
# Load the three MIMIC tables used by the cells below from the data catalog.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# provided by the Kedro notebook session - confirm.
patients: pd.DataFrame = catalog.load('mimic.core_patients')
admissions: pd.DataFrame = catalog.load('mimic.core_admissions')
labevents: pd.DataFrame = catalog.load('mimic.hosp_labevents')

2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `mimic.core_patients` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `mimic.core_admissions` (ParquetDataSet)...
2000-01-01 00:00:00,000 - kedro.io.data_catalog - INFO - Loading data from `mimic.hosp_labevents` (ParquetDataSet)...


In [ ]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime


def mk_dates_relative(
    source_table: pd.DataFrame,
    source_idx_col: str,
    date_table: pd.DataFrame,
    date_col: str,
    date_format: str = None,
):
    rel_table = source_table.copy(deep=False)
    dates = date_table[date_col]

    # Some date indexes can be null, in this case drop the rows that contain them
    # Using a relative date would make no sense for them
    rel_table = rel_table.dropna(subset=[source_idx_col])

    # Some target columns are not dates by default
    # ex. patient table anchor_year is not a date
    if date_format:
        dates = pd.to_datetime(dates, format=date_format)
    else:
        assert is_datetime(dates), f"Date col {date_col} is not a datetime and no date_format was provided"

    # Create column that matches source_table
    synced_dates = dates[rel_table[source_idx_col]]
    synced_dates.index = rel_table.index

    for col in source_table.keys():
        if not is_datetime(source_table[col]):
            continue

        rel_table[col] = source_table[col] - synced_dates

    return rel_table


In [ ]:
# sensitive
# Admission timestamps relative to each patient's anchor year; anchor_year is
# parsed with the '%Y' format. The result is displayed as the cell output.
mk_dates_relative(
    source_table=admissions,
    source_idx_col='subject_id',
    date_table=patients,
    date_col='anchor_year',
    date_format='%Y',
)

In [ ]:
# sensitive
# Use only the first 5,000,000 lab-event rows, then express their timestamps
# relative to the admission time looked up through hadm_id. The result is
# displayed as the cell output.
labevents_short = labevents[:5_000_000]
mk_dates_relative(
    source_table=labevents_short,
    source_idx_col='hadm_id',
    date_table=admissions,
    date_col='admittime',
)

In [ ]:
# Pair the first 40,000,000 lab events with the admission rows that share
# their subject_id. hadm_id exists on both sides, so the merged frame carries
# it as hadm_id_x (lab-event side) and hadm_id_y (admission side).
labevents_short = labevents[:40_000_000]
time_events = pd.merge(
    labevents_short[['subject_id', 'hadm_id', 'charttime', 'storetime']],
    admissions.reset_index()[['subject_id', 'hadm_id', 'admittime', 'dischtime']],
    on='subject_id',
    suffixes=('_x', '_y'),
)

In [ ]:
# Rows whose chart time lies strictly between the admission and discharge
# times of the admission row they were paired with.
hosp_events = (
    time_events['charttime'].gt(time_events['admittime'])
    & time_events['charttime'].lt(time_events['dischtime'])
)
# ...of those, the rows where the lab event's own hadm_id (hadm_id_x) equals
# the paired admission's hadm_id (hadm_id_y).
hosp_events_w_adm = hosp_events & time_events['hadm_id_x'].eq(time_events['hadm_id_y'])
# Lab events (same 40,000,000-row prefix) that have an hadm_id, joined to
# the admissions table on it.
adm_events = pd.merge(
    labevents[:40_000_000].dropna(subset=['hadm_id']),
    admissions.reset_index(),
    on='hadm_id',
)

# Displayed as the cell output: the three counts side by side.
(hosp_events.sum(), hosp_events_w_adm.sum(), len(adm_events))

(22012063, 21740699, 22497081)

In [ ]:
# Displayed as the cell output, in this order:
#   1. number of rows in patients
#   2. number of distinct subject_ids present in both labevents and admissions
#   3. number of distinct subject_ids in admissions
#   4. number of distinct subject_ids in labevents
(
    len(patients),
    len(
        pd.merge(
            labevents[['subject_id']],
            pd.DataFrame(admissions['subject_id'].unique(), columns=['subject_id']),
        )['subject_id'].unique()
    ),
    len(admissions['subject_id'].unique()),
    len(labevents['subject_id'].unique()),
)

(382278, 246569, 256878, 328743)