In [1]:
# Cell 0: Imports & helper utilities
import pandas as pd
import numpy as np
from pathlib import Path
from math import ceil
from datetime import timedelta

# Both display limits share the same value: show up to 120 columns and
# allow 120 characters of console width before pandas wraps the output.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, 120)

print("Ready — imports done.")

Ready — imports done.


In [2]:
# Cell 1: Load admissions.csv and expand it to one row per day (day_index)
csv_path = Path("admissions.csv")

admissions = pd.read_csv(
    csv_path,
    low_memory=False,
    parse_dates=['admittime','dischtime','deathtime','edregtime','edouttime'],
)
print("Loaded admissions.csv from disk. Rows:", len(admissions))

# Force both stay boundaries to datetimes; anything unparseable becomes NaT
for time_col in ('admittime', 'dischtime'):
    admissions[time_col] = pd.to_datetime(admissions[time_col], errors='coerce')

# A missing discharge time falls back to the admission time,
# so the stay still yields at least one day when expanded
admissions['dischtime'] = admissions['dischtime'].fillna(admissions['admittime'])

# Function to expand a single admission row into day rows
def expand_admission_row(row):
    """Expand one admission record into one dict per calendar day of the stay.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series yielded by ``DataFrame.iterrows``)
        Must provide ``subject_id``, ``hadm_id``, ``admittime`` and
        ``dischtime`` via ``row[...]``. All other fields are read with
        ``row.get`` and default to NaN / NaT when absent.

    Returns
    -------
    list[dict]
        One dict per day, with ``day_index`` running from 0 (admission date)
        to the discharge date inclusive. Every dict repeats the admission-level
        fields unchanged. At least one dict is always returned.
    """
    admit_ts = row['admittime']
    disch_ts = row['dischtime']

    if pd.isna(admit_ts) or pd.isna(disch_ts):
        # Without both boundaries the span cannot be computed; the original
        # arithmetic produced NaN here and crashed in range(). Emit one day.
        n_days = 1
    else:
        # Count calendar dates touched by the stay (inclusive on both ends).
        # A discharge dated before the admission still yields a single day.
        n_days = max((disch_ts.date() - admit_ts.date()).days + 1, 1)

    # Admission-level fields are identical for every day, so build them once.
    # 'day_index' is placed third to keep the resulting column order stable.
    template = {
        'subject_id': row['subject_id'],
        'hadm_id': row['hadm_id'],
        'day_index': 0,
        'admittime': admit_ts,
        'dischtime': disch_ts,
        'deathtime': row.get('deathtime', pd.NaT),
        'admission_type': row.get('admission_type', np.nan),
        'admit_provider_id': row.get('admit_provider_id', np.nan),
        'admission_location': row.get('admission_location', np.nan),
        'discharge_location': row.get('discharge_location', np.nan),
        'insurance': row.get('insurance', np.nan),
        'language': row.get('language', np.nan),
        'marital_status': row.get('marital_status', np.nan),
        'race': row.get('race', np.nan),
        'edregtime': row.get('edregtime', pd.NaT),
        'edouttime': row.get('edouttime', pd.NaT),
        'hospital_expire_flag': row.get('hospital_expire_flag', np.nan)
    }
    # Overwriting an existing key keeps its position in the dict
    return [{**template, 'day_index': day} for day in range(n_days)]

# Expand every admission and flatten the per-day records into one list
expanded = [
    day_record
    for _, admission in admissions.iterrows()
    for day_record in expand_admission_row(admission)
]

merged_initial = pd.DataFrame(expanded)
# Key columns become nullable integers
merged_initial = merged_initial.astype({'subject_id': 'Int64', 'hadm_id': 'Int64', 'day_index': 'Int64'})

print("Expanded admissions -> rows:", merged_initial.shape[0])


Loaded admissions.csv from disk. Rows: 21987
Expanded admissions -> rows: 325800


In [3]:
# Cell 2: Attach icustays -> mark ICU presence per (subject_id, hadm_id, day_index)
# - merged_initial itself is left untouched
# - merged_with_icu gets the ICU columns, filled where row_date falls inside an ICU stay

icu_path = Path("icustays.csv")
if not icu_path.exists():
    raise FileNotFoundError(f"icustays.csv not found at {icu_path.resolve()}  -- put the file next to admissions.csv")

# Read the ICU stays with both stay boundaries parsed as datetimes
icustays = pd.read_csv(icu_path, low_memory=False, parse_dates=['intime','outtime'])

# Required columns: stop immediately, naming the first one that is absent
missing_required = [name for name in ['subject_id','hadm_id','intime','outtime'] if name not in icustays.columns]
if missing_required:
    raise KeyError(f"Expected column '{missing_required[0]}' in icustays.csv but it is missing.")

# Optional columns (stay_id, first_careunit, last_careunit, los):
# create any absent one as all-missing so later code can rely on it existing
optional_cols = ['stay_id','first_careunit','last_careunit','los']
for c in optional_cols:
    if c in icustays.columns:
        continue
    icustays[c] = pd.NA

# --- work on a copy so merged_initial is not modified ---
# The old positional index is kept as an explicit 'row_id' column.
merged_with_icu = merged_initial.copy()
merged_with_icu = merged_with_icu.reset_index(drop=False)
merged_with_icu = merged_with_icu.rename(columns={'index':'row_id'})

# ICU output columns start out entirely missing; a column that already exists is kept as is
for col in ['stay_id_icu','icustay_intime','icustay_outtime','first_careunit_icu','last_careunit_icu','los_icu']:
    if col in merged_with_icu.columns:
        continue
    merged_with_icu[col] = pd.NA

# The calendar date of each row is derived from admittime, so it must be present
if 'admittime' not in merged_with_icu.columns:
    raise KeyError("merged_initial must contain 'admittime' column")

# row_date = admission date (time-of-day stripped) + day_index days;
# a missing day_index is treated as day 0
merged_with_icu['admittime'] = pd.to_datetime(merged_with_icu['admittime'], errors='coerce')
merged_with_icu['day_index_int'] = merged_with_icu['day_index'].fillna(0).astype(int)
merged_with_icu['row_date'] = (
    merged_with_icu['admittime'].dt.normalize()
    + pd.to_timedelta(merged_with_icu['day_index_int'], unit='D')
)

# Coerce the ICU boundaries to datetimes; a missing outtime is replaced by the intime
icustays['intime'] = pd.to_datetime(icustays['intime'], errors='coerce')
icustays['outtime'] = pd.to_datetime(icustays['outtime'], errors='coerce')
icustays['outtime'] = icustays['outtime'].fillna(icustays['intime'])
# Date-only versions of both boundaries, used for the day-level comparison below
icustays['intime_norm'] = icustays['intime'].dt.normalize()
icustays['outtime_norm'] = icustays['outtime'].dt.normalize()

# Pair every day row with every ICU stay of the same (subject_id, hadm_id).
# This is many-to-many: a day row appears once per stay of its admission,
# or once with empty ICU fields when the admission has no stay.
icu_keep = ['subject_id','hadm_id','stay_id','intime','outtime','intime_norm','outtime_norm','first_careunit','last_careunit','los']
candidate = merged_with_icu.merge(
    icustays[icu_keep],
    how='left',
    on=['subject_id','hadm_id'],
    suffixes=('','_icu'),
)

# A day counts as "in ICU" when its date lies within [intime date, outtime date], inclusive
mask_in_icu = (candidate['intime_norm'] <= candidate['row_date']) & (candidate['outtime_norm'] >= candidate['row_date'])
candidate['in_icu'] = mask_in_icu.fillna(False)

# For rows that match multiple ICU stays, pick the earliest ICU (by intime)
# keep only matches; then for duplicates keep the one with smallest intime
matched = candidate[candidate['in_icu']].copy()
if not matched.empty:
    # sort so earliest ICU intime appears first for each original row
    matched = matched.sort_values(by=['row_id','intime'])
    # Keep the whole first row per row_id.
    # groupby('row_id').first() is deliberately NOT used: it returns the first
    # non-null value of each column independently, so a null field in the
    # earliest stay would be filled from a later stay, mixing two stays.
    first_matches = matched.drop_duplicates(subset='row_id', keep='first')
    icu_by_row = first_matches.set_index('row_id')
    # map the ICU fields back into merged_with_icu by row_id
    map_cols = {
        'stay_id':'stay_id_icu',
        'intime':'icustay_intime',
        'outtime':'icustay_outtime',
        'first_careunit':'first_careunit_icu',
        'last_careunit':'last_careunit_icu',
        'los':'los_icu'
    }
    # Only rows that actually have a match are written; all others keep their current value
    has_match = merged_with_icu['row_id'].isin(icu_by_row.index)
    for src, dst in map_cols.items():
        # lookup series: row_id -> value of this ICU field
        mapping = icu_by_row[src]
        merged_with_icu.loc[has_match, dst] = merged_with_icu.loc[has_match, 'row_id'].map(mapping)
    assigned_count = len(first_matches)
else:
    assigned_count = 0

# cleanup helper columns
merged_with_icu = merged_with_icu.drop(columns=['day_index_int','row_date'])

print(f"ICU assignment complete. Rows where ICU info filled: {int(assigned_count)}")
# preview some assigned rows (if any)
print(merged_with_icu[merged_with_icu['stay_id_icu'].notna()].head(20))

# merged_with_icu is ready for next steps (icustays merged)
# note: merged_initial remains unchanged


ICU assignment complete. Rows where ICU info filled: 93109
    row_id  subject_id   hadm_id  day_index           admittime           dischtime           deathtime  \
7        7    10001217  24597018          2 2157-11-18 22:56:00 2157-11-25 18:00:00                 NaT   
8        8    10001217  24597018          3 2157-11-18 22:56:00 2157-11-25 18:00:00                 NaT   
48      48    10003400  23559586          6 2137-08-04 00:07:00 2137-09-02 17:05:00 2137-09-02 17:05:00   
49      49    10003400  23559586          7 2137-08-04 00:07:00 2137-09-02 17:05:00 2137-09-02 17:05:00   
50      50    10003400  23559586          8 2137-08-04 00:07:00 2137-09-02 17:05:00 2137-09-02 17:05:00   
51      51    10003400  23559586          9 2137-08-04 00:07:00 2137-09-02 17:05:00 2137-09-02 17:05:00   
55      55    10003400  23559586         13 2137-08-04 00:07:00 2137-09-02 17:05:00 2137-09-02 17:05:00   
56      56    10003400  23559586         14 2137-08-04 00:07:00 2137-09-02 17:05:00 2

In [4]:
# --- Only rename and reorder the ICU columns ---

# Rename the ICU columns
rename_map = dict(
    stay_id_icu='stay_id',
    first_careunit_icu='first_careunit',
    last_careunit_icu='last_careunit',
    icustay_intime='icustays_intime',
    icustay_outtime='icustays_outtime',
    los_icu='los',
)
merged_with_icu = merged_with_icu.rename(columns=rename_map)

# Desired order of the ICU columns
icu_cols_ordered = ['stay_id', 'first_careunit', 'last_careunit', 'icustays_intime', 'icustays_outtime', 'los']

# Every remaining (non-ICU) column, in its current order
other_cols = [name for name in merged_with_icu.columns if name not in icu_cols_ordered]

# Final layout: non-ICU columns first, ICU columns at the end
merged_with_icu = merged_with_icu[other_cols + icu_cols_ordered]

print("Renaming & reordering complete.")
print(merged_with_icu[icu_cols_ordered].head(20))


Renaming & reordering complete.
       stay_id                       first_careunit                        last_careunit      icustays_intime  \
0         <NA>                                 <NA>                                 <NA>                 <NA>   
1         <NA>                                 <NA>                                 <NA>                 <NA>   
2         <NA>                                 <NA>                                 <NA>                 <NA>   
3         <NA>                                 <NA>                                 <NA>                 <NA>   
4         <NA>                                 <NA>                                 <NA>                 <NA>   
5         <NA>                                 <NA>                                 <NA>                 <NA>   
6         <NA>                                 <NA>                                 <NA>                 <NA>   
7   37067082.0  Surgical Intensive Care Unit (SICU)  Surgical In

In [6]:
# Display the per-day admissions frame as the cell output
merged_initial

Unnamed: 0,subject_id,hadm_id,day_index,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000935,26381316,0,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0
1,10000935,26381316,1,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0
2,10000935,26381316,2,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0
3,10000935,26381316,3,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0
4,10000935,26381316,4,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325795,19999828,25744818,6,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,EW EMER.,P13JMH,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,English,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0
325796,19999828,25744818,7,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,EW EMER.,P13JMH,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,English,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0
325797,19999828,25744818,8,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,EW EMER.,P13JMH,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,English,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0
325798,19999828,25744818,9,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,EW EMER.,P13JMH,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,English,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0


In [7]:
# Display the per-day frame with the ICU columns attached as the cell output
merged_with_icu

Unnamed: 0,row_id,subject_id,hadm_id,day_index,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,stay_id,first_careunit,last_careunit,icustays_intime,icustays_outtime,los
0,0,10000935,26381316,0,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,
1,1,10000935,26381316,1,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,
2,2,10000935,26381316,2,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,
3,3,10000935,26381316,3,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,
4,4,10000935,26381316,4,2187-08-23 21:22:00,2187-08-27 15:35:00,NaT,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325795,325795,19999828,25744818,6,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,EW EMER.,P13JMH,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,English,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0,,,,,,
325796,325796,19999828,25744818,7,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,EW EMER.,P13JMH,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,English,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0,,,,,,
325797,325797,19999828,25744818,8,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,EW EMER.,P13JMH,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,English,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0,,,,,,
325798,325798,19999828,25744818,9,2149-01-08 16:44:00,2149-01-18 17:00:00,NaT,EW EMER.,P13JMH,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicaid,English,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0,,,,,,


In [None]:
# Write the full per-day admissions frame to disk.
# With to_csv defaults the DataFrame index is saved as the first column.
merged_initial.to_csv('merged_initial.csv')

In [None]:
# Write the full ICU-annotated frame to disk.
# With to_csv defaults the DataFrame index is saved as the first column.
merged_with_icu.to_csv('merged_with_icu.csv')

In [5]:
# Save only the first 100 rows of the ICU-annotated frame as a sample file
merged_with_icu.head(100).to_csv('merged_with_icu_sample.csv')

In [13]:
# Cell X: merge all_vanco.csv into merged_with_icu
vanco_path = Path("all_vanco.csv")
if not vanco_path.exists():
    raise FileNotFoundError(f"all_vanco.csv not found at {vanco_path.resolve()}")

# Read the lab rows with charttime parsed as a datetime
all_vanco = pd.read_csv(vanco_path, low_memory=False, parse_dates=['charttime'])

# Identifier columns may arrive as floats or strings (e.g. an hadm_id ending in ".0").
# Coerce each to a nullable integer; unparseable entries become <NA>.
for id_col in ('subject_id', 'hadm_id', 'itemid'):
    all_vanco[id_col] = pd.to_numeric(all_vanco[id_col], errors='coerce').astype('Int64')

# helper: resolve numeric value for comparison
def resolve_numeric(row):
    """Return the best available numeric reading for one lab row.

    The free-text ``value`` field is preferred: it is stripped, thousands
    separators (commas) are removed, and it is parsed as a float. When
    ``value`` is missing, is one of the placeholder tokens
    ``''``, ``'___'``, ``'NaN'``, ``'nan'``, or cannot be parsed, the numeric
    ``valuenum`` field is used instead.

    Parameters
    ----------
    row : mapping-like (dict or pandas Series)
        Read through ``row.get('value')`` and ``row.get('valuenum')``.

    Returns
    -------
    float
        The resolved number, or NaN when neither field yields one.
    """
    def _from_valuenum():
        # Fallback path shared by the "missing value" and "unparseable value" cases
        vn = row.get('valuenum')
        try:
            return float(vn) if not pd.isna(vn) else np.nan
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are swallowed; anything else propagates
            return np.nan

    v = row.get('value')
    # treat missing-like tokens as missing
    if pd.isna(v) or str(v).strip() in ('', '___', 'NaN', 'nan'):
        return _from_valuenum()

    # try parse value (may contain commas, spaces)
    try:
        return float(str(v).strip().replace(',', ''))
    except (TypeError, ValueError, OverflowError):
        return _from_valuenum()

# One comparable number per lab row
all_vanco['resolved_val'] = all_vanco.apply(resolve_numeric, axis=1)

# Base frame for the join: a copy of merged_with_icu, which must carry a datetime admittime
merged_with_vanco = merged_with_icu.copy()
if 'admittime' not in merged_with_vanco.columns:
    raise KeyError(f"merged_with_vanco must contain 'admittime' column before merging labs.")
merged_with_vanco['admittime'] = pd.to_datetime(merged_with_vanco['admittime'], errors='coerce')

# Lookup table: one admit time per (subject_id, hadm_id), plus its date-only form
admit_map = (
    merged_with_vanco
    .groupby(['subject_id','hadm_id'], dropna=False)['admittime']
    .first()
    .reset_index()
    .rename(columns={'admittime':'admit_time'})
)
admit_map['admit_date'] = pd.to_datetime(admit_map['admit_time']).dt.normalize()

# merge admit_date into all_vanco to compute day index of each lab
all_vanco = all_vanco.merge(admit_map[['subject_id','hadm_id','admit_date']], on=['subject_id','hadm_id'], how='left')

# Labs whose (subject_id, hadm_id) has no admission cannot be placed on a day -> skip them
missing_admit = all_vanco['admit_date'].isna().sum()
if missing_admit:
    print(f"Warning: {missing_admit} all_vanco rows have no matching admission (admit_date missing) and will be skipped.")
all_vanco = all_vanco[all_vanco['admit_date'].notna()].copy()

# compute lab day index relative to admission day (day0 = admittime.normalize())
all_vanco['chart_date'] = pd.to_datetime(all_vanco['charttime'], errors='coerce').dt.normalize()

# Labs with a missing or unparseable charttime have no known day either.
# They used to be silently assigned to day 0 via fillna(0), which attributed
# them to the admission day; skip them with a warning instead.
missing_chart = int(all_vanco['chart_date'].isna().sum())
if missing_chart:
    print(f"Warning: {missing_chart} all_vanco rows have no usable charttime and will be skipped.")
    all_vanco = all_vanco[all_vanco['chart_date'].notna()].copy()

all_vanco['day_index_lab'] = (all_vanco['chart_date'] - all_vanco['admit_date']).dt.days.astype(int)
# clamp negative days to 0 (labs charted before the admission date count as day 0)
all_vanco.loc[all_vanco['day_index_lab'] < 0, 'day_index_lab'] = 0

# Per (subject, hadm, day_index_lab), keep the single lab row with the largest resolved_val
group_cols = ['subject_id','hadm_id','day_index_lab']
# Rows without a usable number cannot win the max, so they are left out up front
usable = all_vanco[all_vanco['resolved_val'].notna()].copy()
if not usable.empty:
    # idxmax yields, per group, the index label of the row holding the maximum
    idx = usable.groupby(group_cols)['resolved_val'].idxmax()
    daily_vanco = usable.loc[idx].copy()
else:
    print("No usable numeric vanco values found to aggregate.")
    # Empty frame with the expected columns so the following steps still run
    daily_vanco = pd.DataFrame(columns=['subject_id','hadm_id','day_index_lab',
                                       'itemid','charttime','value','valuenum','valueuom','flag','resolved_val'])

# Switch to the DBML column names; the lab day index becomes the join key 'day_index'
dbml_names = {
    'itemid':'all_vanco_itemid',
    'charttime':'all_vanco_charttime',
    'value':'all_vanco_value',
    'valuenum':'all_vanco_valuenum',
    'valueuom':'all_vanco_valueuom',
    'flag':'all_vanco_flag',
    'day_index_lab':'day_index'
}
daily_vanco = daily_vanco.rename(columns=dbml_names)

# Keep only the join keys and the vanco payload columns
merge_cols = ['subject_id','hadm_id','day_index',
              'all_vanco_itemid','all_vanco_charttime','all_vanco_value','all_vanco_valuenum','all_vanco_valueuom','all_vanco_flag']

# Cast the join keys to nullable integers so they match the base frame's key dtypes
daily_vanco = daily_vanco[merge_cols].astype({
    'subject_id': 'Int64',
    'hadm_id': 'Int64',
    'day_index': 'Int64',
})

# Left join onto the base frame: every base row is kept, vanco columns stay empty where no lab matched
merged_with_vanco = merged_with_vanco.merge(
    daily_vanco,
    how='left',
    on=['subject_id','hadm_id','day_index'],
)

print(f"all_vanco merged -> rows with vanco info: {int(merged_with_vanco['all_vanco_itemid'].notna().sum())}")

# Show the first 20 rows that received vanco info, restricted to keys + vanco columns
preview = merged_with_vanco.loc[merged_with_vanco['all_vanco_itemid'].notna()].head(20)
print(preview[['subject_id','hadm_id','day_index',
               'all_vanco_itemid','all_vanco_charttime','all_vanco_value','all_vanco_valuenum','all_vanco_valueuom','all_vanco_flag']])


all_vanco merged -> rows with vanco info: 52915
     subject_id   hadm_id  day_index  all_vanco_itemid all_vanco_charttime all_vanco_value  all_vanco_valuenum  \
2      10000935  26381316          2             51009 2187-08-25 07:50:00            13.7                13.7   
9      10001217  24597018          4             51009 2157-11-22 15:50:00            11.9                11.9   
14     10001401  27060146          1             51009 2131-10-02 06:40:00             8.3                 8.3   
16     10001401  27060146          3             51009 2131-10-04 07:10:00            10.3                10.3   
19     10002769  26924260          1             51009 2186-04-17 21:50:00            17.5                17.5   
21     10002769  26924260          3             51009 2186-04-19 10:30:00            25.6                25.6   
24     10002769  26924260          6             51009 2186-04-22 05:15:00            18.3                18.3   
26     10002769  26924260          8    

In [14]:
# Save only the first 100 rows of the vanco-annotated frame as a sample file
merged_with_vanco.head(100).to_csv('merged_with_vanco_sample.csv')