In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict
import re
import gc

# Widen pandas' rendering: show up to 120 columns and wrap output at 120 characters.
pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 120)

In [None]:
# Load the admissions table; the five timestamp columns are parsed while reading.
csv_path = Path("admissions.csv")
admission_time_cols = ['admittime', 'dischtime', 'deathtime', 'edregtime', 'edouttime']
admissions = pd.read_csv(csv_path, parse_dates=admission_time_cols, low_memory=False)
print("Loaded admissions.csv from disk. Rows:", len(admissions))

# Force admit/discharge to datetime; anything unparseable becomes NaT.
for time_col in ('admittime', 'dischtime'):
    admissions[time_col] = pd.to_datetime(admissions[time_col], errors='coerce')

# A missing discharge time falls back to the admission time.
discharge_filled = admissions['dischtime'].fillna(admissions['admittime'])
admissions['dischtime'] = discharge_filled

def expand_admission_row(row):
    """Expand one admission into one record per calendar day of the stay.

    Parameters
    ----------
    row : mapping-like (``pd.Series`` from ``iterrows`` or a plain dict)
        Must provide ``subject_id``, ``hadm_id``, ``admittime`` and
        ``dischtime`` through ``row[...]``. All other fields are read with
        ``row.get`` and default to ``np.nan`` / ``pd.NaT`` when absent.

    Returns
    -------
    list of dict
        One dict per calendar date touched by the stay (admission date through
        discharge date, inclusive), with ``day_index`` running from 0. At least
        one record is always returned: a discharge date before the admission
        date, or a missing admission/discharge time, yields a single day-0
        record.
    """
    admit_ts = row['admittime']
    disch_ts = row['dischtime']

    if pd.isna(admit_ts) or pd.isna(disch_ts):
        # NaT.date() raises ValueError, so the stay length cannot be computed;
        # keep the admission as a single day-0 record instead of crashing.
        n_days = 1
    else:
        adm_date = admit_ts.normalize().date()
        dis_date = disch_ts.normalize().date()
        n_days = max((dis_date - adm_date).days + 1, 1)

    def _day_record(day_index):
        # Key order defines the column order of the resulting DataFrame.
        return {
            'subject_id': row['subject_id'],
            'hadm_id': row['hadm_id'],
            'day_index': int(day_index),
            'admittime': admit_ts,
            'dischtime': disch_ts,
            'deathtime': row.get('deathtime', pd.NaT),
            'admission_type': row.get('admission_type', np.nan),
            'admit_provider_id': row.get('admit_provider_id', np.nan),
            'admission_location': row.get('admission_location', np.nan),
            'discharge_location': row.get('discharge_location', np.nan),
            'insurance': row.get('insurance', np.nan),
            'language': row.get('language', np.nan),
            'marital_status': row.get('marital_status', np.nan),
            'race': row.get('race', np.nan),
            'edregtime': row.get('edregtime', pd.NaT),
            'edouttime': row.get('edouttime', pd.NaT),
            'hospital_expire_flag': row.get('hospital_expire_flag', np.nan)
        }

    return [_day_record(d) for d in range(n_days)]

# Flatten the per-admission day records into one list, then build the
# admission x day table from it in a single DataFrame construction.
expanded = [
    day_record
    for _, admission_row in admissions.iterrows()
    for day_record in expand_admission_row(admission_row)
]

merged_initial = pd.DataFrame(expanded)

# Nullable integers for the three key columns.
key_columns = ['subject_id', 'hadm_id', 'day_index']
merged_initial[key_columns] = merged_initial[key_columns].astype('Int64')

print("Expanded admissions -> rows:", merged_initial.shape[0])


In [None]:
# Persist the admission x day table; index=False keeps the DataFrame index out of the file.
merged_initial.to_csv("admissions_expanded.csv", index=False)

In [None]:
# Replace the in-memory table with the copy on disk. No parse_dates is passed here,
# so the time columns are not parsed as datetimes on reload; later cells call
# pd.to_datetime on 'admittime' before using it.
merged_initial = pd.read_csv("admissions_expanded.csv")

In [None]:
# Attach ICU-stay information to each admission-day row.
# A row receives the earliest ICU stay (by intime) of the same admission whose
# [intime date, outtime date] range contains the row's calendar date.
icu_path = Path("icustays.csv")
if not icu_path.exists():
    raise FileNotFoundError(f"icustays.csv not found at {icu_path.resolve()}  -- put the file next to admissions.csv")

icustays = pd.read_csv(icu_path, low_memory=False, parse_dates=['intime','outtime'])

for col in ['subject_id','hadm_id','intime','outtime']:
    if col not in icustays.columns:
        raise KeyError(f"Expected column '{col}' in icustays.csv but it is missing.")

# Optional descriptive columns are created empty so the later selection never fails.
optional_cols = ['stay_id','first_careunit','last_careunit','los']
for c in optional_cols:
    if c not in icustays.columns:
        icustays[c] = pd.NA

# row_id is a stable handle for each admission-day row through the merge below.
merged_with_icu = merged_initial.copy().reset_index(drop=False).rename(columns={'index':'row_id'})
for col in ['stay_id_icu','icustay_intime','icustay_outtime','first_careunit_icu','last_careunit_icu','los_icu']:
    if col not in merged_with_icu.columns:
        merged_with_icu[col] = pd.NA

if 'admittime' not in merged_with_icu.columns:
    raise KeyError("merged_initial must contain 'admittime' column")

# Calendar date represented by each row = admission date + day_index days.
merged_with_icu['admittime'] = pd.to_datetime(merged_with_icu['admittime'], errors='coerce')
merged_with_icu['day_index_int'] = merged_with_icu['day_index'].fillna(0).astype(int)
merged_with_icu['row_date'] = merged_with_icu['admittime'].dt.normalize() + pd.to_timedelta(merged_with_icu['day_index_int'], unit='D')

# A stay without an outtime is treated as ending at its intime.
icustays['intime'] = pd.to_datetime(icustays['intime'], errors='coerce')
icustays['outtime'] = pd.to_datetime(icustays['outtime'], errors='coerce').fillna(icustays['intime'])
icustays['intime_norm'] = icustays['intime'].dt.normalize()
icustays['outtime_norm'] = icustays['outtime'].dt.normalize()

icu_keep = ['subject_id','hadm_id','stay_id','intime','outtime','intime_norm','outtime_norm','first_careunit','last_careunit','los']
candidate = merged_with_icu.merge(icustays[icu_keep], on=['subject_id','hadm_id'], how='left', suffixes=('','_icu'))

mask_in_icu = (candidate['row_date'] >= candidate['intime_norm']) & (candidate['row_date'] <= candidate['outtime_norm'])
candidate['in_icu'] = mask_in_icu.fillna(False)

matched = candidate[candidate['in_icu']].copy()
if not matched.empty:
    matched = matched.sort_values(by=['row_id','intime'])
    # Keep the whole earliest-stay row per row_id. GroupBy.first() would instead
    # take the first NON-NULL value of every column independently, which can
    # stitch together fields that belong to different ICU stays.
    first_matches = matched.drop_duplicates(subset='row_id', keep='first').set_index('row_id')
    map_cols = {
        'stay_id':'stay_id_icu',
        'intime':'icustay_intime',
        'outtime':'icustay_outtime',
        'first_careunit':'first_careunit_icu',
        'last_careunit':'last_careunit_icu',
        'los':'los_icu'
    }
    # Only rows that have a match are written; all other rows keep their current value.
    has_match = merged_with_icu['row_id'].isin(first_matches.index)
    for src, dst in map_cols.items():
        merged_with_icu.loc[has_match, dst] = merged_with_icu.loc[has_match, 'row_id'].map(first_matches[src])
    assigned_count = len(first_matches)
else:
    assigned_count = 0

# Helper columns were only needed for the date matching.
merged_with_icu = merged_with_icu.drop(columns=['day_index_int','row_date'])

print(f"ICU assignment complete. Rows where ICU info filled: {int(assigned_count)}")
print(merged_with_icu[merged_with_icu['stay_id_icu'].notna()].head(20))


In [None]:
# Give the ICU columns their final names and move them to the end of the frame.
icu_cols_ordered = ['stay_id', 'first_careunit', 'last_careunit', 'icustays_intime', 'icustays_outtime', 'los']
rename_map = dict(zip(
    ['stay_id_icu', 'first_careunit_icu', 'last_careunit_icu', 'icustay_intime', 'icustay_outtime', 'los_icu'],
    icu_cols_ordered,
))
merged_with_icu = merged_with_icu.rename(columns=rename_map)

# Every non-ICU column keeps its relative position, followed by the ICU block.
other_cols = []
for column_name in merged_with_icu.columns:
    if column_name in icu_cols_ordered:
        continue
    other_cols.append(column_name)

merged_with_icu = merged_with_icu[other_cols + icu_cols_ordered]

print("Renaming & reordering complete.")
print(merged_with_icu[icu_cols_ordered].head(20))


In [None]:
# Display the admission x day table.
merged_initial

In [None]:
# Display the ICU-annotated admission x day table.
merged_with_icu

In [None]:
# Persist the admission x day table. index=False keeps the DataFrame index out of
# the file (as is done for admissions_expanded.csv), so that re-loading
# merged_initial.csv does not introduce a spurious "Unnamed: 0" column.
merged_initial.to_csv('merged_initial.csv', index=False)

In [None]:
# Save the ICU-annotated table. index=False is not passed, so the DataFrame
# index is written as an extra unnamed first column.
merged_with_icu.to_csv('merged_with_icu.csv')

In [None]:
# Save the first 100 rows as a small sample file (index column included).
merged_with_icu.head(100).to_csv('merged_with_icu_sample.csv')

In [None]:
# Load the vancomycin measurements; charttime is parsed while reading.
vanco_path = Path("all_vanco.csv")
if not vanco_path.exists():
    raise FileNotFoundError(f"all_vanco.csv not found at {vanco_path.resolve()}")

all_vanco = pd.read_csv(vanco_path, parse_dates=['charttime'], low_memory=False)

# Identifiers become nullable integers; anything non-numeric turns into <NA>.
for id_col in ('subject_id', 'hadm_id'):
    numeric_ids = pd.to_numeric(all_vanco[id_col], errors='coerce')
    all_vanco[id_col] = numeric_ids.astype('Int64')

def resolve_numeric(row):
    """Return the numeric result of one lab row as a float (NaN if none).

    The free-text ``value`` field is preferred: it is stripped, thousands
    separators (``,``) are removed and it is parsed as a float. When ``value``
    is missing, is one of the placeholders ``''``, ``'___'``, ``'NaN'``,
    ``'nan'``, or cannot be parsed, ``valuenum`` is used instead.

    Parameters
    ----------
    row : mapping-like (``pd.Series`` or dict) read via ``row.get``.

    Returns
    -------
    float
        Parsed value, or ``np.nan`` when neither field yields a number.
    """
    def _from_valuenum():
        vn = row.get('valuenum')
        try:
            return float(vn) if not pd.isna(vn) else np.nan
        # Only conversion failures are swallowed; a bare ``except`` would also
        # hide KeyboardInterrupt/SystemExit during a long ``DataFrame.apply``.
        except (TypeError, ValueError, OverflowError):
            return np.nan

    v = row.get('value')
    if pd.isna(v):
        return _from_valuenum()

    text = str(v).strip()
    if text in ('', '___', 'NaN', 'nan'):
        return _from_valuenum()

    try:
        return float(text.replace(',', ''))
    except (TypeError, ValueError, OverflowError):
        return _from_valuenum()

# Reduce vancomycin measurements to one row per (subject, admission, day) -- the
# measurement with the highest resolved numeric value -- and left-join it onto
# the ICU-annotated admission x day table.
all_vanco['resolved_val'] = all_vanco.apply(resolve_numeric, axis=1)

merged_with_vanco = merged_with_icu.copy()
if 'admittime' not in merged_with_vanco.columns:
    raise KeyError("merged_with_vanco must contain 'admittime' column before merging labs.")
merged_with_vanco['admittime'] = pd.to_datetime(merged_with_vanco['admittime'], errors='coerce')

# One admission date per (subject_id, hadm_id).
admit_map = merged_with_vanco.groupby(['subject_id','hadm_id'], dropna=False)['admittime'].first().reset_index().rename(columns={'admittime':'admit_time'})
admit_map['admit_date'] = pd.to_datetime(admit_map['admit_time']).dt.normalize()

all_vanco = all_vanco.merge(admit_map[['subject_id','hadm_id','admit_date']], on=['subject_id','hadm_id'], how='left')

missing_admit = all_vanco['admit_date'].isna().sum()
if missing_admit:
    print(f"Warning: {missing_admit} all_vanco rows have no matching admission (admit_date missing) and will be skipped.")
all_vanco = all_vanco[all_vanco['admit_date'].notna()].copy()

all_vanco['chart_date'] = pd.to_datetime(all_vanco['charttime'], errors='coerce').dt.normalize()

# A measurement without a usable charttime cannot be placed on any day; skip it
# instead of silently attributing it to day 0.
missing_chart = int(all_vanco['chart_date'].isna().sum())
if missing_chart:
    print(f"Warning: {missing_chart} all_vanco rows have no usable charttime and will be skipped.")
    all_vanco = all_vanco[all_vanco['chart_date'].notna()].copy()

# Day offset from the admission date; measurements charted before admission are
# folded into day 0.
all_vanco['day_index_lab'] = (all_vanco['chart_date'] - all_vanco['admit_date']).dt.days.astype(int)
all_vanco.loc[all_vanco['day_index_lab'] < 0, 'day_index_lab'] = 0

group_cols = ['subject_id','hadm_id','day_index_lab']
usable = all_vanco[~all_vanco['resolved_val'].isna()].copy()
if usable.empty:
    print("No usable numeric vanco values found to aggregate.")
    daily_vanco = pd.DataFrame(columns=['subject_id','hadm_id','day_index_lab',
                                       'charttime','value','valuenum','valueuom','flag','resolved_val'])
else:
    # Row label of the maximum resolved value within each day.
    idx = usable.groupby(group_cols)['resolved_val'].idxmax()
    daily_vanco = usable.loc[idx].copy()

daily_vanco = daily_vanco.rename(columns={
    'charttime':'all_vanco_charttime',
    'value':'all_vanco_value',
    'valuenum':'all_vanco_valuenum',
    'valueuom':'all_vanco_valueuom',
    'flag':'all_vanco_flag',
    'day_index_lab':'day_index'
})

merge_cols = ['subject_id','hadm_id','day_index',
              'all_vanco_charttime','all_vanco_value','all_vanco_valuenum','all_vanco_valueuom','all_vanco_flag']
# Explicit copy so the dtype casts below write to an independent frame rather
# than to a slice of the previous one.
daily_vanco = daily_vanco[merge_cols].copy()

daily_vanco['subject_id'] = daily_vanco['subject_id'].astype('Int64')
daily_vanco['hadm_id'] = daily_vanco['hadm_id'].astype('Int64')
daily_vanco['day_index'] = daily_vanco['day_index'].astype('Int64')

merged_with_vanco = merged_with_vanco.merge(daily_vanco, on=['subject_id','hadm_id','day_index'], how='left')

print(f"all_vanco merged -> rows with vanco info: {int(merged_with_vanco['all_vanco_charttime'].notna().sum())}")

preview = merged_with_vanco[merged_with_vanco['all_vanco_charttime'].notna()].head(20)
print(preview[['subject_id','hadm_id','day_index',
               'all_vanco_charttime','all_vanco_value','all_vanco_valuenum','all_vanco_valueuom','all_vanco_flag']])


In [None]:
# Save the first 100 rows of the vanco-annotated table as a sample (index column included).
merged_with_vanco.head(100).to_csv('merged_with_vanco_sample.csv')

In [None]:
# Peek at chartevents.csv: only the first 1000 rows are read.
chartevents_path = Path("chartevents.csv")
chartevents = pd.read_csv(chartevents_path, nrows=1000)


In [None]:
# Display the first 100 rows, read straight from disk (the result is not stored).
pd.read_csv(chartevents_path,nrows=100)

In [None]:
# Remove pandas' display limits (rows, columns, line width, cell width).
# These are global options, so they also affect every cell executed afterwards.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# Plain-text dump of the first 100 preview rows.
print(chartevents.head(100))

In [None]:
# Per-itemid completeness scan of chartevents.csv, streamed in chunks.
# A row counts as "present" when valuenum is set, or when the text value is not
# one of the empty/placeholder strings.
chartevents_path = Path("chartevents.csv")
if not chartevents_path.exists():
    raise FileNotFoundError(f"chartevents.csv not found at {chartevents_path.resolve()}")

usecols = ['subject_id','hadm_id','itemid','charttime','value','valuenum']
chunksize = 200_000

total_counts = defaultdict(int)     # itemid -> rows seen
present_counts = defaultdict(int)   # itemid -> rows carrying a usable value

reader = pd.read_csv(chartevents_path, usecols=usecols, chunksize=chunksize, low_memory=True)

chunk_i = 0
for chunk_i, chunk in enumerate(reader, start=1):
    chunk['itemid'] = pd.to_numeric(chunk['itemid'], errors='coerce').astype('Int64')
    chunk = chunk[chunk['itemid'].notna()]
    if chunk.empty:
        continue

    has_number = chunk['valuenum'].notna()
    text_values = chunk['value'].astype(str).str.strip()
    has_text = ~text_values.isin(["", "___", "NaN", "nan", "None", "none"])
    present_mask = has_number | has_text

    grp_total = chunk.groupby('itemid').size()
    grp_present = present_mask.groupby(chunk['itemid']).sum()

    # Fold this chunk's per-item counts into the running totals.
    for item, cnt in grp_total.items():
        total_counts[int(item)] += int(cnt)
    for item, cnt in grp_present.items():
        if not pd.isna(item):
            present_counts[int(item)] += int(cnt)

    if chunk_i % 10 == 0:
        print(f"Processed {chunk_i*chunksize:,} rows...")

# One summary row per itemid: (itemid, total, present, missing, present fraction).
itemids = sorted(set(total_counts) | set(present_counts))
rows = []
for iid in itemids:
    tot = total_counts.get(iid, 0)
    pres = present_counts.get(iid, 0)
    frac = pres / tot if tot > 0 else 0.0
    rows.append((iid, tot, pres, tot - pres, frac))

missingness_df = (
    pd.DataFrame(rows, columns=['itemid','total_count','present_count','missing_count','present_fraction'])
    .sort_values(by='present_fraction', ascending=False)
    .reset_index(drop=True)
)
missingness_df.to_csv("chartevents_itemid_missingness.csv", index=False)
print("Saved chartevents_itemid_missingness.csv")


In [None]:
# Reload the per-itemid completeness table written by the previous cell.
df = pd.read_csv("chartevents_itemid_missingness.csv")

# Items with a usable value in fewer than half of their rows are marked for removal.
drop_ids = df.loc[df['present_fraction'] < 0.5, 'itemid'].tolist()

# Report how many itemids will be dropped and show the first 50 of them.
print(f"تعداد itemid هایی که باید drop بشن: {len(drop_ids)}")
print(drop_ids[:50])

In [None]:
# Stream chartevents.csv and write a copy without the rows whose itemid is in
# drop_ids. The first chunk creates the file (with header); later chunks append.
reader = pd.read_csv("chartevents.csv", chunksize=2_000_000)
out_path = "chartevents_missing50_dropped.csv"

first = True
total_rows = total_dropped = total_written = 0

for i, chunk in enumerate(reader, start=1):
    keep_mask = ~chunk['itemid'].isin(drop_ids)
    filtered = chunk.loc[keep_mask]
    before, after = len(chunk), len(filtered)

    if first:
        filtered.to_csv(out_path, mode="w", index=False, header=True)
        first = False
    else:
        filtered.to_csv(out_path, mode="a", index=False, header=False)

    total_rows += before
    total_dropped += before - after
    total_written += after
    print(f"Chunk {i}: rows={before:,}, dropped={before - after:,}, kept={after:,}")

print("---- DONE ----")
print(f"Total rows processed: {total_rows:,}")
print(f"Total rows dropped:   {total_dropped:,}")
print(f"Total rows written:   {total_written:,}")
print("✅ فایل نهایی ذخیره شد:", out_path)

In [None]:
# Load the item dictionary. This rebinds `df`, which previously held the
# per-itemid completeness table.
df = pd.read_csv("d_items.csv")

In [None]:
# head(0) has no rows, so this prints only the column names of d_items.
print(df.head(0))

In [None]:
# Keep only the d_items entries that link to chartevents and were not dropped
# by the completeness filter, and save them to a new file.
in_path = "d_items.csv"
out_path = "d_items_chartevents_missing50_dropped.csv"

df = pd.read_csv(in_path)

links_to_chartevents = df["linksto"].eq("chartevents")
was_dropped = df["itemid"].isin(drop_ids)
filtered = df.loc[links_to_chartevents & ~was_dropped]

filtered.to_csv(out_path, index=False)

print("✅ d_items filtered and saved:", out_path)
n_before, n_after = len(df), len(filtered)
print("before:", n_before, "after:", n_after, "drop:", n_before - n_after)

In [None]:
# Add one empty (pd.NA, object dtype) column per retained chartevents itemid to
# merged_initial, then write the widened table to disk in row chunks.
ditems_path = Path("d_items_chartevents_missing50_dropped.csv")
merged_initial_file = Path("merged_initial.csv")
out_path = Path("merged_initial_with_items_cols.csv")

ditems = pd.read_csv(ditems_path, usecols=['itemid'])
itemids = pd.to_numeric(ditems['itemid'], errors='coerce').dropna().astype(int).unique().tolist()
cols_to_add = [str(i) for i in itemids]
print("Will add columns (count):", len(cols_to_add))

# Use the in-memory table when it exists; otherwise fall back to the CSV on disk.
try:
    merged_initial
    print("Using merged_initial from memory (existing DataFrame). rows:", len(merged_initial))
except NameError:
    if not merged_initial_file.exists():
        raise FileNotFoundError(f"merged_initial not in memory and file {merged_initial_file} not found.")
    print("Loading merged_initial from disk:", merged_initial_file)
    merged_initial = pd.read_csv(merged_initial_file, low_memory=False, parse_dates=['admittime','dischtime','deathtime','edregtime','edouttime'])
    print("Loaded merged_initial rows:", len(merged_initial))

n_rows = len(merged_initial)

# Build all missing columns as one block and attach them with a single concat.
# Inserting them one at a time fragments the frame and makes pandas emit
# PerformanceWarning once many columns have been added.
new_cols = [c for c in dict.fromkeys(cols_to_add) if c not in merged_initial.columns]
if new_cols:
    empty_block = pd.DataFrame(pd.NA, index=merged_initial.index, columns=new_cols, dtype="object")
    merged_initial = pd.concat([merged_initial, empty_block], axis=1)
added = len(new_cols)
print(f"Added {added} new columns. Total columns now: {len(merged_initial.columns)}")

# Write in row chunks; the first chunk creates the file and carries the header.
chunksize = 10000
first = True
written = 0
for start in range(0, n_rows, chunksize):
    end = min(start + chunksize, n_rows)
    chunk = merged_initial.iloc[start:end]
    chunk.to_csv(out_path, mode="w" if first else "a", index=False, header=first)
    first = False
    written += len(chunk)
    print(f"Wrote rows {start:,}..{end-1:,} -> {len(chunk):,} rows")
    del chunk
    gc.collect()

print("✅ Done. Output saved to:", out_path)
print("Rows written:", written, "Columns in output:", len(merged_initial.columns))

In [None]:
# Plain-text preview of the first three rows of the widened table.
print(merged_initial.head(3))

In [None]:
# Fill the per-itemid columns of merged_initial from the filtered chartevents file.
# For every (subject_id, hadm_id, day_index, itemid) the cell receives the daily
# maximum numeric value; when a group has no numeric value, the latest text
# value (by charttime) within the chunk is used instead.
chartevents_path = Path("chartevents_missing50_dropped.csv")
ditems_path = Path("d_items_chartevents_missing50_dropped.csv")
merged_initial_file = None   # set to a CSV path to load merged_initial when it is not in memory
chunksize = 500_000
save_after = False           # not read anywhere in this cell

if not chartevents_path.exists():
    raise FileNotFoundError(chartevents_path)
if not ditems_path.exists():
    raise FileNotFoundError(ditems_path)

ditems = pd.read_csv(ditems_path, usecols=['itemid'])
keep_itemids = pd.to_numeric(ditems['itemid'], errors='coerce').dropna().astype(int).unique().tolist()
keep_itemids_set = set(keep_itemids)
print("Keep itemids count:", len(keep_itemids))

try:
    merged_initial
except NameError:
    if merged_initial_file is None:
        raise NameError("merged_initial not in memory. Set merged_initial_file path or load it.")
    print("Loading merged_initial from disk...")
    merged_initial = pd.read_csv(merged_initial_file, low_memory=False, parse_dates=['admittime'])
    print("loaded merged_initial rows:", len(merged_initial))

# Attach any missing item columns in one concat rather than one insert per
# column, which would fragment the frame.
missing_item_cols = [str(iid) for iid in keep_itemids if str(iid) not in merged_initial.columns]
if missing_item_cols:
    merged_initial = pd.concat(
        [merged_initial,
         pd.DataFrame(pd.NA, index=merged_initial.index, columns=missing_item_cols, dtype="object")],
        axis=1,
    )

# (subject_id, hadm_id) -> normalized admission date.
admit_map = merged_initial.groupby(['subject_id','hadm_id'], dropna=False)['admittime'].first().reset_index().rename(columns={'admittime':'admit_time'})
admit_map['admit_date'] = pd.to_datetime(admit_map['admit_time'], errors='coerce').dt.normalize()
admit_map['key'] = list(zip(admit_map['subject_id'].astype('Int64'), admit_map['hadm_id'].astype('Int64')))
admit_dict = dict(zip(admit_map['key'], admit_map['admit_date']))

# (subject_id, hadm_id, day_index) -> index label of the matching merged_initial row.
# Built column-wise; with duplicate keys the last row wins.
_key_frame = merged_initial[['subject_id','hadm_id','day_index']].astype('int64')
merged_initial_index_map = dict(zip(
    zip(_key_frame['subject_id'].tolist(), _key_frame['hadm_id'].tolist(), _key_frame['day_index'].tolist()),
    merged_initial.index,
))
del _key_frame

print("Admit map keys:", len(admit_dict), "merged rows map size:", len(merged_initial_index_map))


def lookup_admit_date(s):
    """Return the admission date for a row-like object with subject_id/hadm_id (NaT if unknown).

    Not called in this cell; the name is kept defined because other cells may reference it.
    """
    return admit_dict.get((int(s.subject_id), int(s.hadm_id)), pd.NaT)


def pick_final_val(row):
    """Choose the value for one aggregated group: numeric max if present, else the last text value."""
    if pd.notna(row.get('agg_value_num')):
        return row['agg_value_num']
    else:
        v = row.get('agg_value_text')
        if pd.isna(v) or v in ("nan","None","NoneType","NA","<NA>"):
            return pd.NA
        return v


def _is_number(x):
    """True for a real, non-missing numeric scalar (bools excluded)."""
    return isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool) and not pd.isna(x)


def _combine_with_existing(existing, new_val):
    """Merge a chunk-level aggregate into the value already stored in the cell.

    The same (admission, day, item) group can be split across chunks, so a later
    chunk must not blindly overwrite an earlier one:
    * numeric vs numeric -> keep the larger (the daily max),
    * stored numeric vs new text / missing -> keep the stored value,
    * otherwise -> take the new value.
    """
    if pd.isna(new_val):
        return existing
    if _is_number(existing):
        return max(existing, new_val) if _is_number(new_val) else existing
    return new_val


reader = pd.read_csv(chartevents_path, usecols=['subject_id','hadm_id','itemid','charttime','value','valuenum'],
                     parse_dates=['charttime'], chunksize=chunksize, low_memory=True)

total_assigned = 0
chunk_no = 0

for chunk in reader:
    chunk_no += 1
    print(f"\n--- Processing chunk {chunk_no} (rows: {len(chunk)}) ---")
    chunk['itemid'] = pd.to_numeric(chunk['itemid'], errors='coerce').astype('Int64')
    chunk = chunk[chunk['itemid'].notna()]
    chunk = chunk[chunk['itemid'].isin(keep_itemids)]
    if chunk.empty:
        print("no relevant itemids in this chunk")
        continue

    # Rows without identifiers cannot be matched to an admission; drop them so
    # the integer casts below cannot fail. The copy gives an independent frame
    # for the column assignments that follow.
    chunk = chunk.dropna(subset=['subject_id','hadm_id']).copy()
    chunk['subject_id'] = chunk['subject_id'].astype(int)
    chunk['hadm_id'] = chunk['hadm_id'].astype(int)

    keys = list(zip(chunk['subject_id'], chunk['hadm_id']))
    chunk['admit_date'] = [admit_dict.get(k, pd.NaT) for k in keys]

    chunk = chunk[chunk['admit_date'].notna()].copy()
    if chunk.empty:
        print("no rows with admit_date in this chunk")
        continue

    # An event without a usable charttime cannot be placed on a day; skip it
    # instead of attributing it to day 0.
    chunk['chart_date'] = pd.to_datetime(chunk['charttime'], errors='coerce').dt.normalize()
    chunk = chunk[chunk['chart_date'].notna()].copy()
    if chunk.empty:
        print("no rows with usable charttime in this chunk")
        continue

    # Day offset from admission; events charted before admission fold into day 0.
    chunk['day_index'] = (chunk['chart_date'] - chunk['admit_date']).dt.days.astype(int)
    chunk.loc[chunk['day_index'] < 0, 'day_index'] = 0

    # Prefer valuenum; fall back to parsing the text value (thousands separators removed).
    chunk['numeric_val'] = pd.to_numeric(chunk['valuenum'], errors='coerce')
    mask_num_missing = chunk['numeric_val'].isna()
    if mask_num_missing.any():
        parsed = pd.to_numeric(chunk.loc[mask_num_missing, 'value'].astype(str).str.replace(',',''), errors='coerce')
        chunk.loc[mask_num_missing, 'numeric_val'] = parsed

    chunk['value_raw'] = chunk['value'].astype(str)

    grp_keys = ['subject_id','hadm_id','day_index','itemid']

    numeric_rows = chunk[chunk['numeric_val'].notna()].copy()
    if not numeric_rows.empty:
        grp_num = numeric_rows.groupby(grp_keys, as_index=False)['numeric_val'].max()
        grp_num = grp_num.rename(columns={'numeric_val':'agg_value_num'})
    else:
        grp_num = pd.DataFrame(columns=grp_keys + ['agg_value_num'])

    chunk_sorted = chunk.sort_values('charttime')
    grp_last = chunk_sorted.groupby(grp_keys, as_index=False).last()[grp_keys + ['value_raw','charttime']]
    grp_last = grp_last.rename(columns={'value_raw':'agg_value_text', 'charttime':'agg_time_text'})

    merged_grps = pd.merge(grp_last, grp_num, on=grp_keys, how='left')
    merged_grps['final_value'] = merged_grps.apply(pick_final_val, axis=1)

    assigned = 0
    for r in merged_grps.itertuples(index=False):
        key = (int(r.subject_id), int(r.hadm_id), int(r.day_index))
        row_idx = merged_initial_index_map.get(key)
        if row_idx is None:
            continue
        itemid_col = str(int(r.itemid))
        existing = merged_initial.at[row_idx, itemid_col]
        merged_initial.at[row_idx, itemid_col] = _combine_with_existing(existing, r.final_value)
        assigned += 1

    total_assigned += assigned
    print(f"Chunk {chunk_no}: groups aggregated = {len(merged_grps)}, assigned = {assigned}, total_assigned so far = {total_assigned}")

    del chunk, chunk_sorted, numeric_rows, grp_num, grp_last, merged_grps
    gc.collect()

print("\n--- ALL CHUNKS PROCESSED ---")
print("Total assigned cells:", total_assigned)

# Write the filled table in row chunks; the first chunk creates the file with the header.
out_path = Path("merged_with_chartevents_filled.csv")
n_rows = len(merged_initial)
write_chunk = 20000
first = True
for start in range(0, n_rows, write_chunk):
    end = min(start + write_chunk, n_rows)
    merged_initial.iloc[start:end].to_csv(out_path, mode='w' if first else 'a', index=False, header=first)
    first = False
    print(f"Saved rows {start}-{end-1}")
print("Saved final to:", out_path)

In [None]:
# Read back only the first 500 rows of the filled table for inspection.
merged_with_chartevents_filled_path = Path("merged_with_chartevents_filled.csv")
merged_with_chartevents_filled = pd.read_csv(merged_with_chartevents_filled_path, nrows=500)


In [None]:
# Display the 500-row preview of the filled table.
merged_with_chartevents_filled

In [None]:
# Extract every chartevents row of one admission into its own CSV (debug aid).
TARGET_HADM_ID = 23282506   # admission to extract; also embedded in the output file name

input_path = Path("chartevents_missing50_dropped.csv")
output_path = Path(f"chartevents_missing50_dropped_filtered_hadm_id_{TARGET_HADM_ID}.csv")

chunksize = 2_000_000

first = True
total_rows = 0

for i, chunk in enumerate(pd.read_csv(input_path, chunksize=chunksize, low_memory=False)):
    filtered = chunk[chunk['hadm_id'] == TARGET_HADM_ID]
    # The first chunk always (re)creates the file with its header, even when it
    # holds no matching rows, so a stale file from an earlier run is never left
    # in place. Later chunks are appended only when they contain matches.
    if first or not filtered.empty:
        filtered.to_csv(output_path, mode='w' if first else 'a',
                        index=False, header=first)
        first = False
    if not filtered.empty:
        total_rows += len(filtered)
        print(f"Chunk {i}: wrote {len(filtered)} rows (total so far: {total_rows})")

print("Done! Final rows written:", total_rows)
print("Output file:", output_path)


In [None]:
# Inputs/outputs and limits for renaming the itemid columns to readable names.
ditems_path = Path("d_items_chartevents_missing50_dropped.csv")
in_path = Path("merged_with_chartevents_filled.csv")
out_path = Path("merged_with_chartevents_filled_renamed.csv")
chunksize = 20_000
max_name_len = 80

for required_file in (ditems_path, in_path):
    if not required_file.exists():
        raise FileNotFoundError(required_file)

# Everything is read as text; missing labels/abbreviations become empty strings.
d = pd.read_csv(ditems_path, usecols=['itemid','label','abbreviation'], dtype=str)
d['itemid'] = d['itemid'].str.strip()
for text_col in ('label', 'abbreviation'):
    d[text_col] = d[text_col].fillna('').astype(str).str.strip()

# Preferred display name: the abbreviation, or the label when the abbreviation
# is empty (which may itself be empty).
d['chosen'] = d['abbreviation'].where(d['abbreviation'] != '', d['label'])

def sanitize_name(s, max_len=None):
    """Turn an arbitrary label into a compact column-name fragment.

    Steps: strip; replace each whitespace run with ``_``; remove every
    character that is neither a word character (``\\w``) nor ``-``; collapse
    repeated underscores; truncate.

    Parameters
    ----------
    s : object
        Text to sanitize. ``None`` / NaN yield ``''``.
    max_len : int, optional
        Maximum length of the result. Defaults to the notebook-level
        ``max_name_len`` setting when omitted.

    Returns
    -------
    str
    """
    if s is None or pd.isna(s):
        return ''
    limit = max_name_len if max_len is None else max_len
    cleaned = re.sub(r'\s+', '_', str(s).strip())
    cleaned = re.sub(r'[^\w\-]', '', cleaned)
    cleaned = re.sub(r'_+', '_', cleaned)
    return cleaned[:limit]

# Build itemid -> unique readable name, translate the CSV header accordingly,
# then stream the filled table to a new file under the new header.
name_map = {}
used = set()

for _, row in d.iterrows():
    iid = row['itemid']
    chosen = row['chosen']
    if chosen == '':
        base = f"item_{iid}"
    else:
        base = sanitize_name(chosen)
        if base == '':
            base = f"item_{iid}"
    # Disambiguate repeated names: first by appending the itemid, then a counter.
    name = base
    if name in used:
        name = f"{base}__{iid}"
    counter = 1
    while name in used:
        name = f"{base}__{iid}_{counter}"
        counter += 1
    used.add(name)
    name_map[str(iid)] = name

orig_header = pd.read_csv(in_path, nrows=0).columns.tolist()
new_header = []
seen_names = set()   # O(1) membership test for names already emitted
conflicts = 0
for col in orig_header:
    new_col = col
    col_str = str(col).strip()
    mapped = name_map.get(col_str)
    if mapped is None:
        # Header may carry the itemid in float form (e.g. "220045.0").
        try:
            mapped = name_map.get(str(int(float(col_str))))
        except (ValueError, OverflowError):
            mapped = None
    if mapped is not None:
        # Same prefix whichever way the itemid was recognised.
        new_col = "chartevents_" + mapped
    if new_col in seen_names:
        conflicts += 1
        new_col = f"{new_col}__orig_{sanitize_name(col_str)}"
        k = 1
        while new_col in seen_names:
            new_col = f"{new_col}_{k}"; k += 1
    seen_names.add(new_col)
    new_header.append(new_col)

print(f"Prepared header mapping. Total cols: {len(orig_header)}, conflicts resolved: {conflicts}")

sample_map = {k: name_map[k] for k in list(name_map)[:10]}
print("sample itemid->name (first 10):", sample_map)

# Rewrite the file chunk by chunk; only the header changes, the data is untouched.
first = True
rows_written = 0
for i, chunk in enumerate(pd.read_csv(in_path, chunksize=chunksize, low_memory=False)):
    chunk.columns = new_header
    chunk.to_csv(out_path, mode='w' if first else 'a', index=False, header=first)
    first = False
    rows_written += len(chunk)
    print(f"Chunk {i+1}: wrote {len(chunk):,} rows (total {rows_written:,})")
    del chunk
    gc.collect()

print("✅ Done. Output saved to:", out_path)
print("Rows written:", rows_written)

In [None]:
# Read back only the first 500 rows of the renamed table for inspection.
merged_with_chartevents_filled_renamed_path = Path("merged_with_chartevents_filled_renamed.csv")
merged_with_chartevents_filled_renamed = pd.read_csv(merged_with_chartevents_filled_renamed_path, nrows=500)


In [None]:
# Display the 500-row preview of the renamed table.
merged_with_chartevents_filled_renamed

In [None]:
# Peek at datetimeevents.csv: only the first 1000 rows are read.
datetimeevents_path = Path("datetimeevents.csv")
datetimeevents = pd.read_csv(datetimeevents_path, nrows=1000)

In [None]:
# Dump the first 100 rows as CSV text; note the target file name has no .csv extension.
datetimeevents.head(100).to_csv('test_datetimeevents')

In [None]:
# Display the first five rows.
datetimeevents.head(5)

In [None]:
# Peek at microbiologyevents.csv: only the first 1000 rows are read.
microbiologyevents_path = Path("microbiologyevents.csv")
microbiologyevents = pd.read_csv(microbiologyevents_path, nrows=1000)

In [None]:
# Dump the first 100 rows as CSV text; note the target file name has no .csv extension.
microbiologyevents.head(100).to_csv('test_microbiologyevents')

In [None]:
# dup

In [None]:
# --- Config ---
# File locations and tuning constants for merging ICD diagnoses onto the
# admission x day table.
DIAGNOSIS_CSV = Path("diagnoses_icd.csv")  # per-admission diagnosis codes
DICT_CSV = Path("d_icd_diagnoses.csv")  # ICD code dictionary (code, version -> long_title)
MERGED_INITIAL_CSV = Path("admissions_expanded.csv")   # input (admission x day)
OUT_CSV = Path("merged_with_diagnoses.csv")      # output
CHUNK_DIAG = 200_000        # chunk size for diagnoses reading (diagnoses table is usually small)
CHUNK_WRITE = 20_000        # chunk size for writing merged_initial with diagnoses
TOP_K = 5                   # produce diag_1..diag_K columns (set to 0 to disable)

# --- Helpers ---
def normalize_code(code):
    """Canonical form of an ICD code for matching.

    Missing input (None/NaN) gives an empty string; anything else is converted
    to text, stripped of surrounding whitespace, upper-cased and has its dots
    removed.
    """
    return "" if pd.isna(code) else str(code).strip().upper().replace(".", "")

def try_lookup_description(code, version, dict_map):
    """Find the long title for an ICD (code, version) pair, with fallbacks.

    Parameters
    ----------
    code : str
        Normalized ICD code (as produced by ``normalize_code``).
    version : str | int | float | missing
        ICD version. Integer-like values ("9", " 10 ", "9.0", 9, 9.0) are
        canonicalized to the plain integer string ("9", "10"). Anything that
        cannot be read as an integer (e.g. the text "nan") is used as its
        stripped string, so it misses the dictionary instead of raising.
    dict_map : dict
        Mapping ``(code_norm, version_str) -> long_title``.

    Returns
    -------
    The matching title, or ``pd.NA`` when no strategy finds one.

    Lookup order: exact code; code with leading zeros stripped; all-digit code
    left-padded with zeros to width 3, 4 and 5.
    """
    # Canonicalize the version without ever raising: the original
    # str(int(version)) crashed on "9.0" and on the literal string "nan".
    if pd.isna(version):
        version_key = str(version)
    else:
        version_key = str(version).strip()
        try:
            version_key = str(int(float(version_key)))
        except (TypeError, ValueError, OverflowError):
            pass  # not integer-like: keep the stripped text as-is

    # Guard against a missing / non-string code so the string methods below are safe.
    code = "" if pd.isna(code) else str(code)

    key = (code, version_key)
    if key in dict_map:
        return dict_map[key]
    # fallback: drop leading zeros from the queried code (e.g. '0010' -> '10')
    key2 = (code.lstrip("0"), version_key)
    if key2 in dict_map:
        return dict_map[key2]
    # fallback: the dictionary may carry leading zeros the query lacks, so try
    # zero-padding purely numeric codes to the common widths
    if code.isdigit():
        for pad in (3, 4, 5):
            kp = (code.zfill(pad), version_key)
            if kp in dict_map:
                return dict_map[kp]
    return pd.NA

# --- Step 1: load ICD dictionary into memory (small) ---
if not DICT_CSV.exists():
    raise FileNotFoundError(f"{DICT_CSV} not found. Place d_icd_diagnoses.csv next to this script.")

# Columns used: icd_code, icd_version, long_title. Everything is read as text
# so codes keep any leading zeros.
dict_df = pd.read_csv(DICT_CSV, dtype=str)
# Normalized matching keys, kept alongside the raw columns.
dict_df['icd_code_norm'] = dict_df['icd_code'].astype(str).apply(normalize_code)
dict_df['icd_version_norm'] = dict_df['icd_version'].astype(str).str.strip()
# (code_norm, version) -> long_title; on duplicate keys the last row wins.
dict_map = dict(zip(
    zip(dict_df['icd_code_norm'], dict_df['icd_version_norm']),
    dict_df['long_title'],
))

print(f"Loaded ICD dictionary rows: {len(dict_df)}")

# --- Step 2: read diagnoses_icd in chunks and accumulate per-admission lists ---
if not DIAGNOSIS_CSV.exists():
    raise FileNotFoundError(f"{DIAGNOSIS_CSV} not found. Place diagnoses_icd.csv next to this script.")

required_diag_cols = ("subject_id", "hadm_id", "seq_num", "icd_code", "icd_version")

# (subject_id, hadm_id) -> list of (seq_num (int), icd_code_norm (str), icd_version)
acc = defaultdict(list)
rows_seen = 0
for chunk in pd.read_csv(DIAGNOSIS_CSV, chunksize=CHUNK_DIAG, dtype=str, low_memory=False):
    # ensure required columns exist
    for col in required_diag_cols:
        if col not in chunk.columns:
            raise KeyError(f"Expected column '{col}' in diagnoses_icd.csv but missing.")

    # ids become nullable integers; a missing/unparseable seq_num sorts last (99999)
    chunk['subject_id'] = pd.to_numeric(chunk['subject_id'], errors='coerce').astype('Int64')
    chunk['hadm_id'] = pd.to_numeric(chunk['hadm_id'], errors='coerce').astype('Int64')
    chunk['seq_num'] = pd.to_numeric(chunk['seq_num'], errors='coerce').fillna(99999).astype(int)
    # The columns are already text (dtype=str), so no .astype(str) here: it would
    # turn a missing value into the literal "nan", which normalize_code would
    # upper-case to a bogus "NAN" code instead of returning "".
    chunk['icd_code_norm'] = chunk['icd_code'].apply(normalize_code)
    # A missing version stays missing rather than becoming the text "nan".
    chunk['icd_version_norm'] = chunk['icd_version'].str.strip()

    # skip rows missing hadm or subject
    valid = chunk.dropna(subset=['subject_id', 'hadm_id'])
    for subj, hadm, seq, code, ver in zip(valid['subject_id'], valid['hadm_id'], valid['seq_num'],
                                          valid['icd_code_norm'], valid['icd_version_norm']):
        acc[(int(subj), int(hadm))].append((int(seq), code, ver))
    rows_seen += len(valid)
    print(f"Processed diagnoses rows so far: {rows_seen}", end='\r')

print(f"\nTotal diagnosis rows processed: {rows_seen}; unique admissions with diagnoses: {len(acc)}")

# --- Step 3: build per-admission aggregate DataFrame ---
# Fixed column layout, so diag_df carries its merge keys even when no
# diagnosis rows were accumulated (an empty frame would otherwise have no
# columns and could not be merged on subject_id/hadm_id).
diag_columns = (
    ["subject_id", "hadm_id", "diag_n", "diag_codes", "diag_descs",
     "primary_diag_code", "primary_diag_desc"]
    + [f"diag_{k}_code" for k in range(1, TOP_K + 1)]
    + [f"diag_{k}_desc" for k in range(1, TOP_K + 1)]
)

agg_rows = []
for (subj, hadm), entries in acc.items():
    # sort by seq_num ascending (stable, so ties keep file order)
    entries_sorted = sorted(entries, key=lambda e: e[0])
    # Drop empty codes while keeping each code paired with ITS OWN version.
    # Filtering the codes alone shifted every later code onto the wrong version.
    coded = [(code, version) for _, code, version in entries_sorted if code != ""]
    codes = [code for code, _ in coded]
    # lookup descriptions (preserve order); unknown codes give pd.NA
    descs = [try_lookup_description(code, version, dict_map) for code, version in coded]
    n = len(codes)

    row = {
        "subject_id": int(subj),
        "hadm_id": int(hadm),
        "diag_n": int(n),
        "diag_codes": ";".join(codes) if codes else pd.NA,
        # positions line up with diag_codes; a missing description is rendered via str()
        "diag_descs": ";".join([str(d) for d in descs]) if descs else pd.NA,
        "primary_diag_code": codes[0] if n >= 1 else pd.NA,
        "primary_diag_desc": descs[0] if n >= 1 else pd.NA,
    }
    # top-K split: diag_1..diag_K, padded with NA when fewer than K codes exist
    for k in range(1, TOP_K + 1):
        row[f"diag_{k}_code"] = codes[k - 1] if n >= k else pd.NA
        row[f"diag_{k}_desc"] = descs[k - 1] if n >= k else pd.NA
    agg_rows.append(row)

diag_df = pd.DataFrame(agg_rows, columns=diag_columns)
# ensure dtypes (nullable ints, matching the key dtype used for the merge)
diag_df['subject_id'] = diag_df['subject_id'].astype('Int64')
diag_df['hadm_id'] = diag_df['hadm_id'].astype('Int64')
diag_df['diag_n'] = diag_df['diag_n'].astype('Int64')

print("Built diag_df with rows:", len(diag_df))

# --- Step 4: merge diag_df into the admission x day table in chunks (so it is never loaded fully) ---
if not MERGED_INITIAL_CSV.exists():
    # Name the file that is actually configured (the old text said merged_initial.csv).
    raise FileNotFoundError(f"{MERGED_INITIAL_CSV} not found. Place {MERGED_INITIAL_CSV.name} next to this script.")

# A diag_df built from zero rows may have no columns at all; merging on the key
# columns would then raise KeyError, so in that case rows pass through unchanged.
diag_has_keys = {'subject_id', 'hadm_id'}.issubset(diag_df.columns)

first_write = True
written = 0
for chunk in pd.read_csv(MERGED_INITIAL_CSV, chunksize=CHUNK_WRITE, parse_dates=['admittime'], low_memory=False):
    # ensure keys have the same nullable-int dtype as diag_df
    if 'subject_id' in chunk.columns:
        chunk['subject_id'] = pd.to_numeric(chunk['subject_id'], errors='coerce').astype('Int64')
    if 'hadm_id' in chunk.columns:
        chunk['hadm_id'] = pd.to_numeric(chunk['hadm_id'], errors='coerce').astype('Int64')

    if diag_has_keys:
        # admission-level diagnoses are repeated on every day row of that admission
        merged_chunk = chunk.merge(diag_df, on=['subject_id', 'hadm_id'], how='left')
    else:
        merged_chunk = chunk

    # first chunk creates the file with a header; later chunks append
    merged_chunk.to_csv(OUT_CSV, mode='w' if first_write else 'a', index=False, header=first_write)
    first_write = False
    written += len(merged_chunk)
    print(f"Wrote merged rows: {written}", end='\r')
    del chunk, merged_chunk
    gc.collect()

print(f"\nDone. Output saved to: {OUT_CSV} (rows written: {written})")


In [None]:
# Load the full admission x day input table (the file configured as MERGED_INITIAL_CSV).
merged_initial = pd.read_csv(MERGED_INITIAL_CSV)

In [None]:
# (rows, columns) of the input table, for comparison with the merged output.
merged_initial.shape

In [None]:
# Reload the diagnoses merge output from disk (same file name as OUT_CSV above).
merged_with_diagnoses = pd.read_csv("merged_with_diagnoses.csv")

In [None]:
# (rows, columns) of the merged output.
merged_with_diagnoses.shape

In [None]:
# Preview the first ten merged rows.
merged_with_diagnoses.head(10)

In [None]:
# Load procedures_icd.csv in full with pandas' default type inference.
procedures_icd = pd.read_csv("procedures_icd.csv")

In [None]:
# Preview the first five procedure rows.
procedures_icd.head(5)

In [None]:
# Load the procedure code dictionary with pandas' default type inference.
d_icd_procedures = pd.read_csv("d_icd_procedures.csv")

In [None]:
# Preview the first five dictionary rows.
d_icd_procedures.head(5)

In [None]:
# === CONFIG ===
# Inputs: procedure events, the procedure code dictionary, and the
# admission x day table the daily aggregates are joined onto.
procedures_path = Path("procedures_icd.csv")
dprocedures_path = Path("d_icd_procedures.csv")
merged_initial_path = Path("admissions_expanded.csv")
# Outputs: per-chunk aggregates (appended), the final per-day table, and the joined result.
intermediate_chunks_path = Path("procedures_daily_chunks.csv")
final_daily_path = Path("procedures_daily_final.csv")
out_merged_path = Path("merged_with_procedures.csv")

# NOTE: `chunksize` is also the name read by the chunked header-rename loop
# earlier in the notebook, so running this cell rebinds it for that loop too.
chunksize = 500_000   # tune to your environment
write_chunk = 20000   # rows per to_csv call when writing the merged output

# === 0) sanity checks ===
# Fail fast, reporting the absolute path of the first missing input.
for p in (procedures_path, dprocedures_path, merged_initial_path):
    if not p.exists():
        raise FileNotFoundError(f"Required file not found: {p.resolve()}")

# === 1) build admit_date lookup from merged_initial ===
print("Loading merged_initial admissions (admit_time -> admit_date map)...")
mi_cols = ['subject_id', 'hadm_id', 'admittime']
mi = pd.read_csv(merged_initial_path, usecols=mi_cols, parse_dates=['admittime'], low_memory=False)
mi['subject_id'] = pd.to_numeric(mi['subject_id'], errors='coerce').astype('Int64')
mi['hadm_id'] = pd.to_numeric(mi['hadm_id'], errors='coerce').astype('Int64')

# Rows without a usable id cannot be looked up later. Drop them explicitly
# rather than relying on a broad `except Exception` around int(pd.NA) to skip them.
mi = mi.dropna(subset=['subject_id', 'hadm_id'])

# take first (non-null) admittime per (subject_id, hadm_id)
admit_map = (
    mi.groupby(['subject_id', 'hadm_id'])['admittime']
      .first()
      .reset_index()
      .rename(columns={'admittime': 'admit_time'})
)
# midnight of the admission day; unparseable times become NaT
admit_map['admit_date'] = pd.to_datetime(admit_map['admit_time'], errors='coerce').dt.normalize()

# make a dict: (int(subject_id), int(hadm_id)) -> admit_date (Timestamp) for fast lookup
admit_dict = {
    (int(subj), int(hadm)): admit_date
    for subj, hadm, admit_date in zip(admit_map['subject_id'], admit_map['hadm_id'], admit_map['admit_date'])
}
print("Admit map size:", len(admit_dict))

del mi, admit_map; gc.collect()

# === 2) chunked read of procedures_icd -> per-chunk daily aggregates ===
def join_unique_codes(series):
    """Return the distinct, stripped codes of `series` as a sorted ';'-joined string.

    Missing values and the placeholder texts "", "nan" and "None" are ignored;
    the result is "" when nothing remains. Defined once here instead of being
    re-created on every loop iteration.
    """
    unique_codes = {str(x).strip() for x in series.dropna()} - {"", "nan", "None"}
    return ";".join(sorted(unique_codes))


print("Streaming procedures_icd in chunks and writing per-chunk daily aggregates...")
# Remove any intermediate file left by a previous run: if this run writes no
# chunk, step 3 must fail on its existence check rather than reuse stale data.
if intermediate_chunks_path.exists():
    intermediate_chunks_path.unlink()

first_out = True
total_skipped_no_admit = 0
total_rows_processed = 0
reader = pd.read_csv(procedures_path,
                     usecols=['subject_id','hadm_id','seq_num','chartdate','icd_code','icd_version'],
                     # keep codes as text: type inference would read a purely
                     # numeric code such as "0066" as the integer 66
                     dtype={'icd_code': str},
                     parse_dates=['chartdate'],
                     chunksize=chunksize,
                     low_memory=True)

for chunk_i, chunk in enumerate(reader, start=1):
    total_rows_processed += len(chunk)
    # normalize ids
    chunk['subject_id'] = pd.to_numeric(chunk['subject_id'], errors='coerce').astype('Int64')
    chunk['hadm_id'] = pd.to_numeric(chunk['hadm_id'], errors='coerce').astype('Int64')

    # drop rows with missing ids (.copy() so the column assignments below act
    # on an independent frame, not on a slice of the original chunk)
    chunk = chunk[chunk['subject_id'].notna() & chunk['hadm_id'].notna()].copy()
    if chunk.empty:
        print(f"Chunk {chunk_i}: no valid subject/hadm ids, skipping")
        continue

    # icd_code as cleaned string (a missing code becomes the text "nan",
    # which join_unique_codes filters out again)
    chunk['icd_code'] = chunk['icd_code'].astype(str).str.strip()
    # look up each row's admit_date in the prebuilt dict (one lookup per row)
    keys = list(zip(chunk['subject_id'].astype(int), chunk['hadm_id'].astype(int)))
    chunk['admit_date'] = [admit_dict.get(k, pd.NaT) for k in keys]

    # drop rows without admit_date (no matching admission in merged_initial)
    missing_admit_mask = chunk['admit_date'].isna()
    n_missing = int(missing_admit_mask.sum())
    total_skipped_no_admit += n_missing
    if n_missing:
        # keep memory low by filtering now
        chunk = chunk.loc[~missing_admit_mask].copy()
    if chunk.empty:
        print(f"Chunk {chunk_i}: {n_missing} rows had no admit_date; chunk empty after drop -> continue")
        continue

    # compute day_index (chart_date normalized minus admit_date), clipped to >= 0;
    # a missing chartdate falls back to day 0
    chunk['chart_date'] = pd.to_datetime(chunk['chartdate'], errors='coerce').dt.normalize()
    chunk['day_index'] = (chunk['chart_date'] - chunk['admit_date']).dt.days.fillna(0).astype(int)
    chunk.loc[chunk['day_index'] < 0, 'day_index'] = 0

    # group per day and aggregate:
    # - proc_count: number of procedure rows that day
    # - last_proc_charttime: most recent chartdate (max)
    # - proc_codes: unique semicolon-separated icd_code strings (sorted)
    grp = chunk.groupby(['subject_id','hadm_id','day_index'], dropna=False)
    df_agg = grp.agg(
        proc_count = ('icd_code', 'size'),
        last_proc_charttime = ('chartdate', 'max'),
        proc_codes = ('icd_code', join_unique_codes)
    ).reset_index()

    # write per-chunk aggregates (append); the same (subject, hadm, day) key can
    # appear in several chunks and is combined in step 3
    df_agg.to_csv(intermediate_chunks_path, mode='w' if first_out else 'a', index=False, header=first_out)
    first_out = False

    print(f"Chunk {chunk_i}: rows_in={len(chunk):,}, groups_out={len(df_agg):,}, skipped_no_admit={n_missing}")
    del chunk, df_agg, grp
    gc.collect()

print("Streaming done. Total rows processed:", total_rows_processed)
print("Total rows skipped because no matching admission:", total_skipped_no_admit)

# === 3) finalize aggregated daily procedures by grouping intermediate file ===
print("Reading intermediate chunks and final-aggregating...")
if not intermediate_chunks_path.exists():
    raise FileNotFoundError(f"Expected intermediate file {intermediate_chunks_path} not found.")

# proc_codes is forced to text: if every stored value happened to look numeric
# (single codes such as "0066"), type inference would parse the column as
# numbers and silently drop the leading zeros.
daily = pd.read_csv(intermediate_chunks_path, parse_dates=['last_proc_charttime'],
                    dtype={'proc_codes': str}, low_memory=False)

# final aggregation: sum counts, max(last_proc_charttime), union of proc_codes across chunked writes
def union_semicolon_lists(series):
    """Union several ';'-separated code lists into one sorted ';'-joined string.

    Missing entries and empty strings in `series` are skipped, each piece is
    stripped, and blank pieces are discarded. Returns "" when no code is left.
    """
    merged = {
        piece.strip()
        for item in series.dropna()
        if not (item == "")
        for piece in str(item).split(";")
        if piece.strip() != ""
    }
    return ";".join(sorted(merged)) if merged else ""

# Collapse the per-chunk rows so each (subject, admission, day) appears once:
# counts are summed, the latest chart time is kept, code lists are unioned.
daily_key_cols = ['subject_id', 'hadm_id', 'day_index']
daily_agg_spec = {
    'proc_count': ('proc_count', 'sum'),
    'last_proc_charttime': ('last_proc_charttime', 'max'),
    'proc_codes': ('proc_codes', union_semicolon_lists),
}
final = daily.groupby(daily_key_cols, as_index=False).agg(**daily_agg_spec)

# Checkpoint the result before titles are added.
final.to_csv(final_daily_path, index=False)
print("Final per-day procedures saved to:", final_daily_path)
del daily
gc.collect()

# === 4) optionally map codes -> titles from d_icd_procedures (if file available) ===
print("Loading d_icd_procedures to map codes -> titles (if available)...")
# Read everything as text so codes keep their exact spelling.
dproc = pd.read_csv(dprocedures_path, dtype=str, low_memory=False)
dproc['icd_code'] = dproc['icd_code'].astype(str).str.strip()
# Missing titles become "" so lookups always return a string.
dproc_titles = dproc['long_title'].fillna("").astype(str)
# icd_code -> long_title; keyed by code only, the last row wins on duplicates.
code2title = {code: title for code, title in zip(dproc['icd_code'], dproc_titles)}

def map_codes_to_titles(codes_str, title_map=None):
    """Translate a ';'-separated code list into a ';'-separated title list.

    Parameters
    ----------
    codes_str : str or missing
        Codes joined with ';'. Missing values and "" give "".
    title_map : dict, optional
        Mapping code -> title. Defaults to the module-level ``code2title``.

    Returns
    -------
    str
        Titles of the codes that have a non-empty title, in code order. Codes
        without a title are skipped, so positions need not line up with the input.
    """
    if title_map is None:
        title_map = code2title
    if pd.isna(codes_str) or codes_str == "":
        return ""
    # Strip each code BEFORE the lookup: the previous filter stripped only for
    # the emptiness test, so a padded code passed the filter but missed the map.
    # str() also tolerates a non-string cell (e.g. a number read back from CSV).
    codes = [c.strip() for c in str(codes_str).split(";")]
    titles = [title_map.get(c, "") for c in codes if c != ""]
    return ";".join(t for t in titles if t != "")

# Add a ';'-separated list of titles next to each day's code list.
final['proc_titles'] = final['proc_codes'].apply(map_codes_to_titles)
# Save updated final
final.to_csv(final_daily_path, index=False)
print("Final per-day procedures (with titles) saved to:", final_daily_path)

# === 5) LEFT JOIN final daily procedures into merged_initial and write merged file ===
print("Merging final daily procedure aggregates into merged_initial master table...")
merged = pd.read_csv(merged_initial_path, low_memory=False, parse_dates=['admittime','dischtime','deathtime'])
# ensure types (nullable ints on both sides so the join keys compare equal)
merged['subject_id'] = pd.to_numeric(merged['subject_id'], errors='coerce').astype('Int64')
merged['hadm_id'] = pd.to_numeric(merged['hadm_id'], errors='coerce').astype('Int64')
merged['day_index'] = pd.to_numeric(merged['day_index'], errors='coerce').astype('Int64')

# load final daily procedures; the text columns are pinned to str so an
# all-numeric proc_codes column is not inferred as numbers (which would drop
# leading zeros and later render as e.g. "66.0" after astype(str))
proc_daily = pd.read_csv(final_daily_path, parse_dates=['last_proc_charttime'],
                         dtype={'proc_codes': str, 'proc_titles': str}, low_memory=False)
proc_daily['subject_id'] = pd.to_numeric(proc_daily['subject_id'], errors='coerce').astype('Int64')
proc_daily['hadm_id'] = pd.to_numeric(proc_daily['hadm_id'], errors='coerce').astype('Int64')
proc_daily['day_index'] = pd.to_numeric(proc_daily['day_index'], errors='coerce').astype('Int64')

# left join; many_to_one makes pandas raise if proc_daily ever held duplicate
# keys, which would otherwise silently multiply master rows
merged_with_proc = merged.merge(proc_daily, on=['subject_id','hadm_id','day_index'],
                                how='left', validate='many_to_one')

# optional: fill NaN counts with 0
merged_with_proc['proc_count'] = merged_with_proc['proc_count'].fillna(0).astype('Int64')
# keep proc_codes and proc_titles as empty string where missing
merged_with_proc['proc_codes'] = merged_with_proc['proc_codes'].fillna("").astype(str)
merged_with_proc['proc_titles'] = merged_with_proc['proc_titles'].fillna("").astype(str)

# write final merged CSV in slices of `write_chunk` rows
print("Writing final merged file (chunked writes)...")
n_rows = len(merged_with_proc)
first = True
for start in range(0, n_rows, write_chunk):
    end = min(start + write_chunk, n_rows)
    # first slice creates the file with a header; later slices append
    merged_with_proc.iloc[start:end].to_csv(out_merged_path, mode='w' if first else 'a', index=False, header=first)
    first = False
    print(f"Wrote rows {start}..{end-1}")
print("Merged output saved to:", out_merged_path)


In [None]:
# Reload the admission x day input table for a size comparison with the merged output.
merged_initial = pd.read_csv("admissions_expanded.csv")

In [None]:
# (rows, columns) of the input table.
merged_initial.shape

In [None]:
# Reload the procedures merge output from disk.
merged_with_procedures = pd.read_csv("merged_with_procedures.csv")

In [None]:
# (rows, columns) of the merged output.
merged_with_procedures.shape

In [None]:
# Preview the first 100 merged rows.
merged_with_procedures.head(100)