In [2]:
# Cell 0: Imports & helper utilities
import pandas as pd
import numpy as np
from pathlib import Path
from math import ceil
from datetime import timedelta
from collections import defaultdict
import re
import os
import gc

pd.set_option('display.max_columns', 120)
pd.set_option('display.width', 120)

print("Ready — imports done.")

Ready — imports done.


In [2]:
# Cell 1: Load admissions.csv (or use sample if file not present) and expand to one row per day (day_index)
csv_path = Path("admissions.csv")

admissions = pd.read_csv(csv_path, low_memory=False,
                            parse_dates=['admittime','dischtime','deathtime','edregtime','edouttime'])
print("Loaded admissions.csv from disk. Rows:", len(admissions))

# Ensure datetime types
admissions['admittime'] = pd.to_datetime(admissions['admittime'], errors='coerce')
admissions['dischtime'] = pd.to_datetime(admissions['dischtime'], errors='coerce')

# If dischtime missing, fill with admittime (so at least one day is produced)
admissions['dischtime'] = admissions['dischtime'].fillna(admissions['admittime'])

# Function to expand a single admission row into day rows
def expand_admission_row(row):
    adm_date = row['admittime'].normalize().date()
    dis_date = row['dischtime'].normalize().date()
    n_days = (dis_date - adm_date).days + 1
    if n_days <= 0:
        n_days = 1
    rows = []
    for d in range(n_days):
        new = {
            'subject_id': row['subject_id'],
            'hadm_id': row['hadm_id'],
            'day_index': int(d),
            'admittime': row['admittime'],
            'dischtime': row['dischtime'],
            'deathtime': row['deathtime'],
            'admission_type': row.get('admission_type', np.nan),
            'admit_provider_id': row.get('admit_provider_id', np.nan),
            'admission_location': row.get('admission_location', np.nan),
            'discharge_location': row.get('discharge_location', np.nan),
            'insurance': row.get('insurance', np.nan),
            'language': row.get('language', np.nan),
            'marital_status': row.get('marital_status', np.nan),
            'race': row.get('race', np.nan),
            'edregtime': row.get('edregtime', pd.NaT),
            'edouttime': row.get('edouttime', pd.NaT),
            'hospital_expire_flag': row.get('hospital_expire_flag', np.nan)
        }
        rows.append(new)
    return rows

# Expand all admissions
expanded = []
for _, r in admissions.iterrows():
    expanded.extend(expand_admission_row(r))

merged_initial = pd.DataFrame(expanded)
# Convert types
merged_initial[['subject_id','hadm_id','day_index']] = merged_initial[['subject_id','hadm_id','day_index']].astype('Int64')

print("Expanded admissions -> rows:", merged_initial.shape[0])


Loaded admissions.csv from disk. Rows: 21987
Expanded admissions -> rows: 325800


In [None]:
# Cell 2: Attach icustays -> mark ICU presence per (subject_id, hadm_id, day_index)
# - Preserves merged_initial unchanged
# - Produces merged_with_icu with ICU columns filled when row_date falls inside an icu stay

icu_path = Path("icustays.csv")
if not icu_path.exists():
    raise FileNotFoundError(f"icustays.csv not found at {icu_path.resolve()}  -- put the file next to admissions.csv")

# load icustays (parse datetimes)
icustays = pd.read_csv(icu_path, low_memory=False, parse_dates=['intime','outtime'])

# normalize / required columns (tolerant if some optional cols missing)
for col in ['subject_id','hadm_id','intime','outtime']:
    if col not in icustays.columns:
        raise KeyError(f"Expected column '{col}' in icustays.csv but it is missing.")

# optional helpful columns: stay_id, first_careunit, last_careunit, los
optional_cols = ['stay_id','first_careunit','last_careunit','los']
for c in optional_cols:
    if c not in icustays.columns:
        icustays[c] = pd.NA  # create if missing so downstream code is simpler

# --- prepare merged copy (do not modify merged_initial in-place) ---
merged_with_icu = merged_initial.copy().reset_index(drop=False).rename(columns={'index':'row_id'})
# add ICU output columns (keeps original names minimal)
for col in ['stay_id_icu','icustay_intime','icustay_outtime','first_careunit_icu','last_careunit_icu','los_icu']:
    if col not in merged_with_icu.columns:
        merged_with_icu[col] = pd.NA

# compute the date of each merged row (admission day + day_index)
# assume admittime exists and day_index present
if 'admittime' not in merged_with_icu.columns:
    raise KeyError("merged_initial must contain 'admittime' column")

# ensure admittime is datetime
merged_with_icu['admittime'] = pd.to_datetime(merged_with_icu['admittime'], errors='coerce')
merged_with_icu['day_index_int'] = merged_with_icu['day_index'].fillna(0).astype(int)
merged_with_icu['row_date'] = merged_with_icu['admittime'].dt.normalize() + pd.to_timedelta(merged_with_icu['day_index_int'], unit='D')

# normalize icu intime/outtime to dates (fill missing outtime with intime)
icustays['intime'] = pd.to_datetime(icustays['intime'], errors='coerce')
icustays['outtime'] = pd.to_datetime(icustays['outtime'], errors='coerce').fillna(icustays['intime'])
icustays['intime_norm'] = icustays['intime'].dt.normalize()
icustays['outtime_norm'] = icustays['outtime'].dt.normalize()

# create a merged candidate set joining by subject_id & hadm_id (many-to-many)
# keep the icu's key cols for matching and assignment
icu_keep = ['subject_id','hadm_id','stay_id','intime','outtime','intime_norm','outtime_norm','first_careunit','last_careunit','los']
candidate = merged_with_icu.merge(icustays[icu_keep], on=['subject_id','hadm_id'], how='left', suffixes=('','_icu'))

# mark rows where row_date is within icu interval
mask_in_icu = (candidate['row_date'] >= candidate['intime_norm']) & (candidate['row_date'] <= candidate['outtime_norm'])
candidate['in_icu'] = mask_in_icu.fillna(False)

# For rows that match multiple ICU stays, pick the earliest ICU (by intime)
# keep only matches; then for duplicates keep the one with smallest intime
matched = candidate[candidate['in_icu']].copy()
if not matched.empty:
    # sort so earliest ICU intime appears first for each original row
    matched = matched.sort_values(by=['row_id','intime'])
    # pick first match per row_id
    first_matches = matched.groupby('row_id', as_index=False).first()
    # map the ICU fields back into merged_with_icu by row_id
    map_cols = {
        'stay_id':'stay_id_icu',
        'intime':'icustay_intime',
        'outtime':'icustay_outtime',
        'first_careunit':'first_careunit_icu',
        'last_careunit':'last_careunit_icu',
        'los':'los_icu'
    }
    for src, dst in map_cols.items():
        # build mapping series
        mapping = first_matches.set_index('row_id')[src]
        merged_with_icu.loc[merged_with_icu['row_id'].isin(mapping.index), dst] = merged_with_icu.loc[merged_with_icu['row_id'].isin(mapping.index), 'row_id'].map(mapping)
    assigned_count = len(first_matches)
else:
    assigned_count = 0

# cleanup helper columns
merged_with_icu = merged_with_icu.drop(columns=['day_index_int','row_date'])

print(f"ICU assignment complete. Rows where ICU info filled: {int(assigned_count)}")
# preview some assigned rows (if any)
print(merged_with_icu[merged_with_icu['stay_id_icu'].notna()].head(20))

# merged_with_icu is ready for next steps (icustays merged)
# note: merged_initial remains unchanged


In [None]:
# --- فقط تغییر اسم و ترتیب ستون‌های ICU ---

# تغییر نام ستون‌ها
rename_map = {
    'stay_id_icu': 'stay_id',
    'first_careunit_icu': 'first_careunit',
    'last_careunit_icu': 'last_careunit',
    'icustay_intime': 'icustays_intime',
    'icustay_outtime': 'icustays_outtime',
    'los_icu': 'los'
}
merged_with_icu = merged_with_icu.rename(columns=rename_map)

# جابجایی ترتیب ستون‌های ICU
icu_cols_ordered = ['stay_id', 'first_careunit', 'last_careunit', 'icustays_intime', 'icustays_outtime', 'los']

# بقیه ستون‌ها (به جز ICU)
other_cols = [c for c in merged_with_icu.columns if c not in icu_cols_ordered]

# بازآرایی نهایی
merged_with_icu = merged_with_icu[other_cols + icu_cols_ordered]

print("Renaming & reordering complete.")
print(merged_with_icu[icu_cols_ordered].head(20))


In [None]:
merged_initial

In [None]:
merged_with_icu

In [None]:
merged_initial.to_csv('merged_initial.csv')

In [None]:
merged_with_icu.to_csv('merged_with_icu.csv')

In [None]:
merged_with_icu.head(100).to_csv('merged_with_icu_sample.csv')

In [None]:
# Cell X: merge all_vanco.csv into merged_with_icu (or merged_initial if icu-step not present)
vanco_path = Path("all_vanco.csv")
if not vanco_path.exists():
    raise FileNotFoundError(f"all_vanco.csv not found at {vanco_path.resolve()}")

# load and parse
all_vanco = pd.read_csv(vanco_path, low_memory=False, parse_dates=['charttime'])

# normalize ids to integers (some hadm_id may have .0)
all_vanco['subject_id'] = pd.to_numeric(all_vanco['subject_id'], errors='coerce').astype('Int64')
all_vanco['hadm_id'] = pd.to_numeric(all_vanco['hadm_id'], errors='coerce').astype('Int64')

# helper: resolve numeric value for comparison
def resolve_numeric(row):
    v = row.get('value')
    vn = row.get('valuenum')
    # treat missing-like tokens as missing
    if pd.isna(v) or str(v).strip() in ['', '___', 'NaN', 'nan']:
        try:
            return float(vn) if not pd.isna(vn) else np.nan
        except:
            return np.nan
    # try parse value (may contain commas, spaces)
    s = str(v).strip().replace(',', '')
    try:
        return float(s)
    except:
        # fallback to valuenum
        try:
            return float(vn) if not pd.isna(vn) else np.nan
        except:
            return np.nan

all_vanco['resolved_val'] = all_vanco.apply(resolve_numeric, axis=1)

# Choose base merged DF to join into (prefer merged_with_icu if exists)


# ensure admittime exists and is datetime
merged_with_vanco = merged_with_icu.copy()
if 'admittime' not in merged_with_vanco.columns:
    raise KeyError(f"merged_with_vanco must contain 'admittime' column before merging labs.")
merged_with_vanco['admittime'] = pd.to_datetime(merged_with_vanco['admittime'], errors='coerce')

# build admit lookup (one admittime per subject_id,hadm_id)
admit_map = merged_with_vanco.groupby(['subject_id','hadm_id'], dropna=False)['admittime'].first().reset_index().rename(columns={'admittime':'admit_time'})
admit_map['admit_date'] = pd.to_datetime(admit_map['admit_time']).dt.normalize()

# merge admit_date into all_vanco to compute day index of each lab
all_vanco = all_vanco.merge(admit_map[['subject_id','hadm_id','admit_date']], on=['subject_id','hadm_id'], how='left')

# if admit_date missing -> we cannot compute day_index -> drop those rows (or keep with NaN day)
missing_admit = all_vanco['admit_date'].isna().sum()
if missing_admit:
    print(f"Warning: {missing_admit} all_vanco rows have no matching admission (admit_date missing) and will be skipped.")
all_vanco = all_vanco[all_vanco['admit_date'].notna()].copy()

# compute lab day index relative to admission day (day0 = admittime.normalize())
all_vanco['chart_date'] = pd.to_datetime(all_vanco['charttime'], errors='coerce').dt.normalize()
all_vanco['day_index_lab'] = (all_vanco['chart_date'] - all_vanco['admit_date']).dt.days.fillna(0).astype(int)
# clamp negative days to 0
all_vanco.loc[all_vanco['day_index_lab'] < 0, 'day_index_lab'] = 0

# For each (subject, hadm, day_index_lab) pick the row with maximum resolved_val
group_cols = ['subject_id','hadm_id','day_index_lab']
# drop rows where resolved_val is NaN (no usable numeric) — they won't contribute to max
usable = all_vanco[~all_vanco['resolved_val'].isna()].copy()
if usable.empty:
    print("No usable numeric vanco values found to aggregate.")
    # create empty daily_vanco with expected cols
    daily_vanco = pd.DataFrame(columns=['subject_id','hadm_id','day_index_lab',
                                       'charttime','value','valuenum','valueuom','flag','resolved_val'])
else:
    idx = usable.groupby(group_cols)['resolved_val'].idxmax()
    daily_vanco = usable.loc[idx].copy()

# rename columns to DBML names
daily_vanco = daily_vanco.rename(columns={
    'charttime':'all_vanco_charttime',
    'value':'all_vanco_value',
    'valuenum':'all_vanco_valuenum',
    'valueuom':'all_vanco_valueuom',
    'flag':'all_vanco_flag',
    'day_index_lab':'day_index'
})

# keep only needed cols for merging
merge_cols = ['subject_id','hadm_id','day_index',
              'all_vanco_charttime','all_vanco_value','all_vanco_valuenum','all_vanco_valueuom','all_vanco_flag']
daily_vanco = daily_vanco[merge_cols]

# ensure types align
daily_vanco['subject_id'] = daily_vanco['subject_id'].astype('Int64')
daily_vanco['hadm_id'] = daily_vanco['hadm_id'].astype('Int64')
daily_vanco['day_index'] = daily_vanco['day_index'].astype('Int64')

# merge into base (left join so all base rows remain); prefer existing base as left
merged_with_vanco = merged_with_vanco.merge(daily_vanco, on=['subject_id','hadm_id','day_index'], how='left')

print(f"all_vanco merged -> rows with vanco info: {int(merged_with_vanco['all_vanco_charttime'].notna().sum())}")

# quick preview (first 20 rows that got vanco info)
preview = merged_with_vanco[merged_with_vanco['all_vanco_charttime'].notna()].head(20)
print(preview[['subject_id','hadm_id','day_index',
               'all_vanco_charttime','all_vanco_value','all_vanco_valuenum','all_vanco_valueuom','all_vanco_flag']])


In [None]:
merged_with_vanco.head(100).to_csv('merged_with_vanco_sample.csv')

In [None]:
chartevents_path = Path("chartevents.csv")
chartevents = pd.read_csv(chartevents_path, nrows=1000)


In [None]:
pd.read_csv(chartevents_path,nrows=100)

In [None]:
pd.set_option("display.max_rows", None)   # همه ردیف‌ها
pd.set_option("display.max_columns", None)  # همه ستون‌ها
pd.set_option("display.width", None)        # عرض رو محدود نکن
pd.set_option("display.max_colwidth", None) # طول رشته ستون رو کامل نشون بده

In [None]:
print(chartevents.head(100))

In [None]:
# chunked missingness (FIXED)
chartevents_path = Path("chartevents.csv")
if not chartevents_path.exists():
    raise FileNotFoundError(f"chartevents.csv not found at {chartevents_path.resolve()}")

usecols = ['subject_id','hadm_id','itemid','charttime','value','valuenum']
chunksize = 200_000

total_counts = defaultdict(int)
present_counts = defaultdict(int)

reader = pd.read_csv(chartevents_path, usecols=usecols, chunksize=chunksize, low_memory=True)

chunk_i = 0
for chunk in reader:
    chunk_i += 1
    chunk['itemid'] = pd.to_numeric(chunk['itemid'], errors='coerce').astype('Int64')
    chunk = chunk[chunk['itemid'].notna()]
    if chunk.empty:
        continue

    # present mask: True if valuenum exists
    present_mask = ~chunk['valuenum'].isna()

    # rows that need textual check (valuenum is NaN)
    need_check = chunk['valuenum'].isna()
    if need_check.any():
        vals = chunk.loc[need_check, 'value'].astype(str).str.strip()
        good = ~vals.isin(["", "___", "NaN", "nan", "None", "none"])
        # <-- FIX: use boolean mask indexing, not need_check.index
        present_mask.loc[need_check] = good.values

    # aggregate counts per itemid for this chunk
    grp_total = chunk.groupby('itemid').size()
    # <-- FIX: group the boolean series by the corresponding itemid values and sum
    grp_present = present_mask.groupby(chunk['itemid']).sum()

    for item, cnt in grp_total.items():
        total_counts[int(item)] += int(cnt)
    for item, cnt in grp_present.items():
        if pd.isna(item):
            continue
        present_counts[int(item)] += int(cnt)

    if chunk_i % 10 == 0:
        print(f"Processed {chunk_i*chunksize:,} rows...")

# build results DataFrame (unchanged from before)
itemids = sorted(set(list(total_counts.keys()) + list(present_counts.keys())))
rows = []
for iid in itemids:
    tot = total_counts.get(iid, 0)
    pres = present_counts.get(iid, 0)
    miss = tot - pres
    frac = pres / tot if tot > 0 else 0.0
    rows.append((iid, tot, pres, miss, frac))

missingness_df = pd.DataFrame(rows, columns=['itemid','total_count','present_count','missing_count','present_fraction'])
missingness_df = missingness_df.sort_values(by='present_fraction', ascending=False).reset_index(drop=True)
missingness_df.to_csv("chartevents_itemid_missingness.csv", index=False)
print("Saved chartevents_itemid_missingness.csv")


In [None]:
df = pd.read_csv("chartevents_itemid_missingness.csv")

# شرط: کمتر از 50 درصد داده موجود
drop_ids = df.loc[df['present_fraction'] < 0.5, 'itemid'].tolist()

print(f"تعداد itemid هایی که باید drop بشن: {len(drop_ids)}")
print(drop_ids[:50])  # برای اینکه فقط ۵۰ تا اولی رو ببینی

In [None]:
reader = pd.read_csv("chartevents.csv", chunksize=2_000_000)
out_path = "chartevents_missing50_dropped.csv"

first = True
total_rows = 0
total_dropped = 0
total_written = 0

for i, chunk in enumerate(reader, start=1):
    before = len(chunk)
    filtered = chunk.loc[~chunk['itemid'].isin(drop_ids)]
    after = len(filtered)
    
    # ذخیره به فایل
    filtered.to_csv(out_path, mode="w" if first else "a", index=False, header=first)
    first = False
    
    # لاگ
    total_rows += before
    total_dropped += before - after
    total_written += after
    print(f"Chunk {i}: rows={before:,}, dropped={before - after:,}, kept={after:,}")

print("---- DONE ----")
print(f"Total rows processed: {total_rows:,}")
print(f"Total rows dropped:   {total_dropped:,}")
print(f"Total rows written:   {total_written:,}")
print("✅ فایل نهایی ذخیره شد:", out_path)

In [None]:
df = pd.read_csv("d_items.csv")

In [None]:
print(df.head(0))

In [None]:
# فایل ورودی/خروجی
in_path = "d_items.csv"
out_path = "d_items_chartevents_missing50_dropped.csv"

# فرض می‌کنیم drop_ids رو قبلاً ساختی
# drop_ids = [...]

# خواندن d_items
df = pd.read_csv(in_path)

# فیلتر کردن
filtered = df.loc[
    (df["linksto"] == "chartevents") & 
    (~df["itemid"].isin(drop_ids))
]

# ذخیره فایل نهایی
filtered.to_csv(out_path, index=False)

print("✅ d_items filtered and saved:", out_path)
print("before:", len(df), "after:", len(filtered), "drop:", len(df) - len(filtered))

In [3]:
# مسیرها / اسم فایل خروجی
ditems_path = Path("d_items_chartevents_missing50_dropped.csv")
merged_initial_file = Path("merged_initial.csv")   # اگر merged_initial در حافظه نیست از این خوانده می‌شود
out_path = Path("merged_initial_with_items_cols.csv")

# 1) load itemids
ditems = pd.read_csv(ditems_path, usecols=['itemid'])
itemids = pd.to_numeric(ditems['itemid'], errors='coerce').dropna().astype(int).unique().tolist()
cols_to_add = [str(i) for i in itemids]   # ستون‌ها به صورت رشته نامیده می‌شوند
print("Will add columns (count):", len(cols_to_add))

# 2) ensure merged_initial exists (either in memory or read from CSV)
try:
    merged_initial  # اگر در نوت‌بوک تعریف شده باشه از حافظه استفاده می‌کنیم
    print("Using merged_initial from memory (existing DataFrame). rows:", len(merged_initial))
except NameError:
    if not merged_initial_file.exists():
        raise FileNotFoundError(f"merged_initial not in memory and file {merged_initial_file} not found.")
    print("Loading merged_initial from disk:", merged_initial_file)
    merged_initial = pd.read_csv(merged_initial_file, low_memory=False, parse_dates=['admittime','dischtime','deathtime','edregtime','edouttime'])
    print("Loaded merged_initial rows:", len(merged_initial))

# 3) add columns (only those missing)
n_rows = len(merged_initial)
added = 0
for c in cols_to_add:
    if c not in merged_initial.columns:
        # مقدار اولیه را pd.NA قرار می‌دهیم؛ dtype فعلی object خواهد بود (قابل نگهداری اعداد/متن)
        merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
        added += 1
print(f"Added {added} new columns. Total columns now: {len(merged_initial.columns)}")

# 4) ذخیره چانک‌چانک (برای جلوگیری از مصرف زیاد حافظه/IO spike)
chunksize = 10000   # می‌تونی این عدد رو کم/زیاد کنی؛ اگر رم کم داری کوچکتر کن
first = True
written = 0
for start in range(0, n_rows, chunksize):
    end = min(start + chunksize, n_rows)
    chunk = merged_initial.iloc[start:end]
    chunk.to_csv(out_path, mode="w" if first else "a", index=False, header=first)
    first = False
    written += len(chunk)
    print(f"Wrote rows {start:,}..{end-1:,} -> {len(chunk):,} rows")
    # پاکسازی کوچک برای آزاد کردن حافظه
    del chunk
    gc.collect()

print("✅ Done. Output saved to:", out_path)
print("Rows written:", written, "Columns in output:", len(merged_initial.columns))

Will add columns (count): 2999
Using merged_initial from memory (existing DataFrame). rows: 325800


  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_initial[c] = pd.Series([pd.NA] * n_rows, dtype="object")
  merged_i

Added 2999 new columns. Total columns now: 3016
Wrote rows 0..9,999 -> 10,000 rows
Wrote rows 10,000..19,999 -> 10,000 rows
Wrote rows 20,000..29,999 -> 10,000 rows
Wrote rows 30,000..39,999 -> 10,000 rows
Wrote rows 40,000..49,999 -> 10,000 rows
Wrote rows 50,000..59,999 -> 10,000 rows
Wrote rows 60,000..69,999 -> 10,000 rows
Wrote rows 70,000..79,999 -> 10,000 rows
Wrote rows 80,000..89,999 -> 10,000 rows
Wrote rows 90,000..99,999 -> 10,000 rows
Wrote rows 100,000..109,999 -> 10,000 rows
Wrote rows 110,000..119,999 -> 10,000 rows
Wrote rows 120,000..129,999 -> 10,000 rows
Wrote rows 130,000..139,999 -> 10,000 rows
Wrote rows 140,000..149,999 -> 10,000 rows
Wrote rows 150,000..159,999 -> 10,000 rows
Wrote rows 160,000..169,999 -> 10,000 rows
Wrote rows 170,000..179,999 -> 10,000 rows
Wrote rows 180,000..189,999 -> 10,000 rows
Wrote rows 190,000..199,999 -> 10,000 rows
Wrote rows 200,000..209,999 -> 10,000 rows
Wrote rows 210,000..219,999 -> 10,000 rows
Wrote rows 220,000..229,999 -> 1

In [5]:
print(merged_initial.head(3))

   subject_id   hadm_id  day_index           admittime           dischtime deathtime admission_type admit_provider_id  \
0    10000935  26381316          0 2187-08-23 21:22:00 2187-08-27 15:35:00       NaT       EW EMER.            P52V4D   
1    10000935  26381316          1 2187-08-23 21:22:00 2187-08-27 15:35:00       NaT       EW EMER.            P52V4D   
2    10000935  26381316          2 2187-08-23 21:22:00 2187-08-27 15:35:00       NaT       EW EMER.            P52V4D   

  admission_location        discharge_location insurance language marital_status                    race  \
0     EMERGENCY ROOM  SKILLED NURSING FACILITY  Medicare  English         SINGLE  BLACK/AFRICAN AMERICAN   
1     EMERGENCY ROOM  SKILLED NURSING FACILITY  Medicare  English         SINGLE  BLACK/AFRICAN AMERICAN   
2     EMERGENCY ROOM  SKILLED NURSING FACILITY  Medicare  English         SINGLE  BLACK/AFRICAN AMERICAN   

            edregtime           edouttime  hospital_expire_flag 220001 220045 2200

In [None]:
# ---------- params ----------
chartevents_path = Path("chartevents_missing50_dropped.csv")
ditems_path = Path("d_items_chartevents_missing50_dropped.csv")
# merged_initial already in memory (as per your notebook). If not, set merged_initial_file and read it.
merged_initial_file = None   # e.g. "merged_initial.csv" if needed; otherwise merged_initial must exist
chunksize = 500_000         # تعداد ردیف برای هر chunk از chartevents (کمتر کن اگر رم کم داری)
save_after = False          # اگر True می‌خواهد پس از هر chunk intermediate ذخیره کنیم (معمولاً False کافی است)
# ----------------------------

# sanity checks
if not chartevents_path.exists():
    raise FileNotFoundError(chartevents_path)
if not ditems_path.exists():
    raise FileNotFoundError(ditems_path)

# load keep itemids
ditems = pd.read_csv(ditems_path, usecols=['itemid'])
keep_itemids = pd.to_numeric(ditems['itemid'], errors='coerce').dropna().astype(int).unique().tolist()
keep_itemids_set = set(keep_itemids)
print("Keep itemids count:", len(keep_itemids))

# ensure merged_initial present (either in memory or read from file)
try:
    merged_initial  # noqa: F821
except NameError:
    if merged_initial_file is None:
        raise NameError("merged_initial not in memory. Set merged_initial_file path or load it.")
    print("Loading merged_initial from disk...")
    merged_initial = pd.read_csv(merged_initial_file, low_memory=False, parse_dates=['admittime'])
    print("loaded merged_initial rows:", len(merged_initial))

# ensure the itemid columns exist in merged_initial
# (you said you already added them — if not, add with pd.NA)
for iid in keep_itemids:
    col = str(iid)
    if col not in merged_initial.columns:
        merged_initial[col] = pd.Series([pd.NA] * len(merged_initial), dtype="object")

# build admit_map for computing day_index quickly: (subject_id,hadm_id) -> admit_date (normalized date)
admit_map = merged_initial.groupby(['subject_id','hadm_id'], dropna=False)['admittime'].first().reset_index().rename(columns={'admittime':'admit_time'})
admit_map['admit_date'] = pd.to_datetime(admit_map['admit_time'], errors='coerce').dt.normalize()
# turn into dict for fast lookup
admit_map['key'] = list(zip(admit_map['subject_id'].astype('Int64'), admit_map['hadm_id'].astype('Int64')))
admit_dict = dict(zip(admit_map['key'], admit_map['admit_date']))

# build mapping from (subject_id,hadm_id,day_index) -> row index in merged_initial
# assume combination unique (one row per day per hadm)
merged_initial_index_map = {}
for idx, row in merged_initial[['subject_id','hadm_id','day_index']].iterrows():
    key = (int(row['subject_id']), int(row['hadm_id']), int(row['day_index']))
    merged_initial_index_map[key] = idx

print("Admit map keys:", len(admit_dict), "merged rows map size:", len(merged_initial_index_map))

# Reader for chartevents
reader = pd.read_csv(chartevents_path, usecols=['subject_id','hadm_id','itemid','charttime','value','valuenum'],
                     parse_dates=['charttime'], chunksize=chunksize, low_memory=True)

total_assigned = 0
chunk_no = 0

for chunk in reader:
    chunk_no += 1
    print(f"\n--- Processing chunk {chunk_no} (rows: {len(chunk)}) ---")
    # normalize itemid and filter to wanted ones
    chunk['itemid'] = pd.to_numeric(chunk['itemid'], errors='coerce').astype('Int64')
    chunk = chunk[chunk['itemid'].notna()]
    chunk = chunk[chunk['itemid'].isin(keep_itemids)]
    if chunk.empty:
        print("no relevant itemids in this chunk")
        continue

    # quick type conversions
    chunk['subject_id'] = chunk['subject_id'].astype(int)
    chunk['hadm_id'] = chunk['hadm_id'].astype(int)

    # map admit_date via admit_dict (faster than merge)
    def lookup_admit_date(s):
        return admit_dict.get((int(s.subject_id), int(s.hadm_id)), pd.NaT)
    # vectorized-ish: create key tuples and map using dict
    keys = list(zip(chunk['subject_id'].astype(int), chunk['hadm_id'].astype(int)))
    chunk['admit_date'] = [admit_dict.get(k, pd.NaT) for k in keys]

    # drop rows without admit_date
    chunk = chunk[chunk['admit_date'].notna()]
    if chunk.empty:
        print("no rows with admit_date in this chunk")
        continue

    # compute chart_date (normalized) and day_index relative to admit_date
    chunk['chart_date'] = chunk['charttime'].dt.normalize()
    chunk['day_index'] = (chunk['chart_date'] - chunk['admit_date']).dt.days.fillna(0).astype(int)
    chunk.loc[chunk['day_index'] < 0, 'day_index'] = 0

    # resolved numeric value: prefer valuenum if present, else try numeric parse of 'value'
    # create numeric_val (float) and raw_value (str)
    chunk['numeric_val'] = pd.to_numeric(chunk['valuenum'], errors='coerce')
    # for rows where numeric_val is NaN, try parse from value string
    mask_num_missing = chunk['numeric_val'].isna()
    if mask_num_missing.any():
        parsed = pd.to_numeric(chunk.loc[mask_num_missing, 'value'].astype(str).str.replace(',',''), errors='coerce')
        chunk.loc[mask_num_missing, 'numeric_val'] = parsed

    chunk['value_raw'] = chunk['value'].astype(str)

    # GROUP AGGREGATION:
    # keys: subject_id, hadm_id, day_index, itemid
    grp_keys = ['subject_id','hadm_id','day_index','itemid']

    # 1) groups that have numeric values -> take numeric max per group
    numeric_rows = chunk[chunk['numeric_val'].notna()].copy()
    if not numeric_rows.empty:
        grp_num = numeric_rows.groupby(grp_keys, as_index=False)['numeric_val'].max()
        grp_num = grp_num.rename(columns={'numeric_val':'agg_value_num'})
    else:
        grp_num = pd.DataFrame(columns=grp_keys + ['agg_value_num'])

    # 2) for groups that have no numeric val, take last text by charttime
    # To compute last by charttime: sort then group.last()
    chunk_sorted = chunk.sort_values('charttime')
    grp_last = chunk_sorted.groupby(grp_keys, as_index=False).last()[grp_keys + ['value_raw','charttime']]
    grp_last = grp_last.rename(columns={'value_raw':'agg_value_text', 'charttime':'agg_time_text'})

    # 3) combine: if grp has numeric in grp_num use that, else use grp_last
    # merge grp_last with grp_num to know which groups have numeric
    merged_grps = pd.merge(grp_last, grp_num, on=grp_keys, how='left')

    # build final aggregated DataFrame for this chunk
    # if agg_value_num notna -> use that else use agg_value_text
    def pick_final_val(row):
        if pd.notna(row.get('agg_value_num')):
            return row['agg_value_num']
        else:
            # if text 'nan' or 'None' control
            v = row.get('agg_value_text')
            if pd.isna(v) or v in ("nan","None","NoneType","NA","<NA>"):
                return pd.NA
            return v

    merged_grps['final_value'] = merged_grps.apply(pick_final_val, axis=1)

    # 4) assign values into merged_initial using mapping dict merged_initial_index_map
    assigned = 0
    for _, r in merged_grps.iterrows():
        key = (int(r['subject_id']), int(r['hadm_id']), int(r['day_index']))
        row_idx = merged_initial_index_map.get(key)
        if row_idx is None:
            # no matching admission-day row in merged_initial (possible) -> skip
            continue
        itemid_col = str(int(r['itemid']))
        val = r['final_value']
        # write to DataFrame cell (in-place)
        # convert floats that are integer-like to native python types optional
        merged_initial.at[row_idx, itemid_col] = val
        assigned += 1

    total_assigned += assigned
    print(f"Chunk {chunk_no}: groups aggregated = {len(merged_grps)}, assigned = {assigned}, total_assigned so far = {total_assigned}")

    # cleanup
    del chunk, chunk_sorted, numeric_rows, grp_num, grp_last, merged_grps
    gc.collect()

print("\n--- ALL CHUNKS PROCESSED ---")
print("Total assigned cells:", total_assigned)

# finally, save merged_initial to CSV in row-chunks to avoid memory spikes
out_path = Path("merged_with_chartevents_filled.csv")
n_rows = len(merged_initial)
write_chunk = 20000
first = True
for start in range(0, n_rows, write_chunk):
    end = min(start + write_chunk, n_rows)
    merged_initial.iloc[start:end].to_csv(out_path, mode='w' if first else 'a', index=False, header=first)
    first = False
    print(f"Saved rows {start}-{end-1}")
print("Saved final to:", out_path) # Saved final to: merged_with_chartevents_filled.csv (43m)

Keep itemids count: 2999
Admit map keys: 21987 merged rows map size: 325800

--- Processing chunk 1 (rows: 500000) ---
Chunk 1: groups aggregated = 102253, assigned = 102235, total_assigned so far = 102235

--- Processing chunk 2 (rows: 500000) ---
Chunk 2: groups aggregated = 102197, assigned = 102178, total_assigned so far = 204413

--- Processing chunk 3 (rows: 500000) ---
Chunk 3: groups aggregated = 105008, assigned = 104894, total_assigned so far = 309307

--- Processing chunk 4 (rows: 500000) ---
Chunk 4: groups aggregated = 99542, assigned = 99542, total_assigned so far = 408849

--- Processing chunk 5 (rows: 500000) ---
Chunk 5: groups aggregated = 97295, assigned = 97272, total_assigned so far = 506121

--- Processing chunk 6 (rows: 500000) ---
Chunk 6: groups aggregated = 102254, assigned = 102254, total_assigned so far = 608375

--- Processing chunk 7 (rows: 500000) ---
Chunk 7: groups aggregated = 104146, assigned = 103984, total_assigned so far = 712359

--- Processing ch

In [13]:
merged_with_chartevents_filled_path = Path("merged_with_chartevents_filled.csv")
merged_with_chartevents_filled = pd.read_csv(merged_with_chartevents_filled_path, nrows=500)


  merged_with_chartevents_filled = pd.read_csv(merged_with_chartevents_filled_path, nrows=500)


In [14]:
merged_with_chartevents_filled

Unnamed: 0,subject_id,hadm_id,day_index,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,220001,220045,220046,220047,220048,220050,220051,220052,220056,220058,220059,220060,220061,220063,220066,220069,220072,220073,220074,220088,220120,220125,220128,220179,220180,220181,220194,220210,220218,220224,220227,220228,220235,220245,220274,220277,220283,220292,220293,220339,220507,220541,220545,...,230102,230103,230105,230106,230107,230108,230109,230110,230111,230112,230113,230114,230115,230116,230117,230118,230123,230125,230126,230127,230128,230129,230130,230131,230132,230134,230135,230136,230137,230138,230140,230144,230145,230146,230147,230148,230149,230150,230151,230152,230153,230154,230155,230156,230157,230159,230160,230161,230162,230163,230164,230165,230166,230167,230168,230169,230170,230171,230176,230177
0,10000935,26381316,0,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,10000935,26381316,1,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10000935,26381316,2,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10000935,26381316,3,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,10000935,26381316,4,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,10016742,29281842,4,2178-07-03 21:13:00,2178-07-08 20:20:00,,EW EMER.,P61W98,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicaid,English,SINGLE,BLACK/AFRICAN AMERICAN,2178-07-03 17:39:00,2178-07-03 22:45:00,0,.Care Plan - Altered Respiratory Function: Ine...,87.0,130.0,50.0,SR (Sinus Rhythm),,,,,,,,,,,,,,,,,,,172.0,118.0,130.0,,23.0,,,,10.4,,159.0,,100.0,,4.0,21.0,5.0,,,33.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
496,10016742,29281842,5,2178-07-03 21:13:00,2178-07-08 20:20:00,,EW EMER.,P61W98,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicaid,English,SINGLE,BLACK/AFRICAN AMERICAN,2178-07-03 17:39:00,2178-07-03 22:45:00,0,,86.0,125.0,50.0,SR (Sinus Rhythm),,,,,,,,,,,,,,,,,,,155.0,98.0,108.0,,26.0,,,,9.8,,165.0,,100.0,,4.0,21.0,5.0,,,30.5,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
497,10017285,23282506,0,2166-05-05 14:59:00,2166-05-14 15:43:00,,EW EMER.,P276OU,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicaid,Portuguese,MARRIED,HISPANIC OR LATINO,2166-05-05 13:17:00,2166-05-05 16:40:00,0,Subdural hemorrhage (SDH),76.0,120.0,60.0,SR (Sinus Rhythm),,,,,,,,,,,,,,,,,,,135.0,75.0,90.0,,15.0,,,,9.4,,,,100.0,,4.0,16.0,5.0,,,30.7,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
498,10017285,23282506,1,2166-05-05 14:59:00,2166-05-14 15:43:00,,EW EMER.,P276OU,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicaid,Portuguese,MARRIED,HISPANIC OR LATINO,2166-05-05 13:17:00,2166-05-05 16:40:00,0,Subdural hemorrhage (SDH),96.0,120.0,60.0,SR (Sinus Rhythm),,,,,,,,,,,,,,,,,,,142.0,63.0,81.0,,26.0,,217.0,,8.9,51.0,,,100.0,,4.0,16.0,10.0,,,28.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [15]:
input_path = Path("chartevents_missing50_dropped.csv")
output_path = Path("chartevents_missing50_dropped_filtered_hadm_id_23282506.csv")

chunksize = 2_000_000  # می‌تونی تغییر بدی

first = True
total_rows = 0

for i, chunk in enumerate(pd.read_csv(input_path, chunksize=chunksize, low_memory=False)):
    filtered = chunk[chunk['hadm_id'] == 23282506]
    if not filtered.empty:
        filtered.to_csv(output_path, mode='w' if first else 'a',
                        index=False, header=first)
        first = False
        total_rows += len(filtered)
        print(f"Chunk {i}: wrote {len(filtered)} rows (total so far: {total_rows})")

print("Done! Final rows written:", total_rows)
print("Output file:", output_path)


Chunk 0: wrote 7949 rows (total so far: 7949)
Done! Final rows written: 7949
Output file: chartevents_missing50_dropped_filtered_hadm_id_23282506.csv


In [3]:
# ---------- paths ----------
ditems_path = Path("d_items_chartevents_missing50_dropped.csv")
in_path = Path("merged_with_chartevents_filled.csv")   # ورودی بزرگ
out_path = Path("merged_with_chartevents_filled_renamed.csv")  # خروجی جدید
# ---------- params ----------
chunksize = 20_000   # تعداد ردیف برای هر chunk خواندن/نوشتن (کمتر کن اگر رم کمتر داری)
max_name_len = 80    # حداکثر طول برای نام ستون (اختیاری)
# ------------------------

if not ditems_path.exists():
    raise FileNotFoundError(ditems_path)
if not in_path.exists():
    raise FileNotFoundError(in_path)

# ---------- 1) بساز نگاشت itemid -> desired_name (abbrev if present else label) ----------
d = pd.read_csv(ditems_path, usecols=['itemid','label','abbreviation'], dtype=str)
# clean whitespace
d['itemid'] = d['itemid'].str.strip()
d['label'] = d['label'].fillna('').astype(str).str.strip()
d['abbreviation'] = d['abbreviation'].fillna('').astype(str).str.strip()

# choose abbreviation if present else label
d['chosen'] = d.apply(lambda r: r['abbreviation'] if r['abbreviation']!='' else (r['label'] if r['label']!='' else ''), axis=1)

def sanitize_name(s):
    """Make header-safe, short, and deterministic string."""
    if pd.isna(s) or s is None:
        return ''
    s = str(s).strip()
    # replace whitespace with underscore
    s = re.sub(r'\s+', '_', s)
    # remove characters except letters, numbers, underscore, dash
    s = re.sub(r'[^\w\-]', '', s)
    # collapse multiple underscores
    s = re.sub(r'_+', '_', s)
    s = s[:max_name_len]
    return s

# build initial mapping and ensure uniqueness
name_map = {}    # key: itemid as string, value: final column name
used = set()

for _, row in d.iterrows():
    iid = row['itemid']
    chosen = row['chosen']
    if chosen == '':
        base = f"item_{iid}"
    else:
        base = sanitize_name(chosen)
        if base == '':
            base = f"item_{iid}"
    # ensure unique: if collision append __<itemid>
    name = base
    if name in used:
        name = f"{base}__{iid}"
    # as fallback, if still collision (very unlikely), append counter
    counter = 1
    while name in used:
        name = f"{base}__{iid}_{counter}"
        counter += 1
    used.add(name)
    name_map[str(iid)] = name

# ---------- 2) read header of input and prepare final new_columns list ----------
orig_header = pd.read_csv(in_path, nrows=0).columns.tolist()
new_header = []
conflicts = 0
for col in orig_header:
    # if column is an itemid (digits only) and exists in mapping -> replace
    new_col = col
    col_str = str(col).strip()
    # try exact match using numeric-like names (most itemid columns are digits strings)
    if col_str in name_map:
        new_col = name_map[col_str]
    else:
        # try converting to int then str (covers possible leading zeros or int types)
        try:
            icol = str(int(float(col_str)))  # safe convert e.g. '220045.0' -> '220045'
            if icol in name_map:
                new_col = name_map[icol]
        except Exception:
            pass
    # ensure no duplicate among new_header; if duplicate, append suffix with original col id
    if new_col in new_header:
        conflicts += 1
        new_col = f"{new_col}__orig_{sanitize_name(col_str)}"
        # still ensure uniqueness
        k = 1
        while new_col in new_header:
            new_col = f"{new_col}_{k}"; k += 1
    new_header.append(new_col)

print(f"Prepared header mapping. Total cols: {len(orig_header)}, conflicts resolved: {conflicts}")

# ---------- show a small sample of mapping (optional) ----------
sample_map = {k: name_map[k] for k in list(name_map)[:10]}
print("sample itemid->name (first 10):", sample_map)

# ---------- 3) stream through input file in chunks, set df.columns = new_header and write ----------
first = True
rows_written = 0
for i, chunk in enumerate(pd.read_csv(in_path, chunksize=chunksize, low_memory=False)):
    # assign new header (chunk.columns matches orig_header length)
    chunk.columns = new_header
    # write
    chunk.to_csv(out_path, mode='w' if first else 'a', index=False, header=first)
    first = False
    rows_written += len(chunk)
    print(f"Chunk {i+1}: wrote {len(chunk):,} rows (total {rows_written:,})")
    # cleanup
    del chunk
    gc.collect()

print("✅ Done. Output saved to:", out_path)
print("Rows written:", rows_written)

Prepared header mapping. Total cols: 3016, conflicts resolved: 0
sample itemid->name (first 10): {'220001': 'Problem_List', '220045': 'HR', '220046': 'HR_Alarm_-_High', '220047': 'HR_Alarm_-_Low', '220048': 'Heart_Rhythm', '220050': 'ABPs', '220051': 'ABPd', '220052': 'ABPm', '220056': 'ABP_Alarm_-_Low', '220058': 'ABP_Alarm_-_High'}
Chunk 1: wrote 20,000 rows (total 20,000)
Chunk 2: wrote 20,000 rows (total 40,000)
Chunk 3: wrote 20,000 rows (total 60,000)
Chunk 4: wrote 20,000 rows (total 80,000)
Chunk 5: wrote 20,000 rows (total 100,000)
Chunk 6: wrote 20,000 rows (total 120,000)
Chunk 7: wrote 20,000 rows (total 140,000)
Chunk 8: wrote 20,000 rows (total 160,000)
Chunk 9: wrote 20,000 rows (total 180,000)
Chunk 10: wrote 20,000 rows (total 200,000)
Chunk 11: wrote 20,000 rows (total 220,000)
Chunk 12: wrote 20,000 rows (total 240,000)
Chunk 13: wrote 20,000 rows (total 260,000)
Chunk 14: wrote 20,000 rows (total 280,000)
Chunk 15: wrote 20,000 rows (total 300,000)
Chunk 16: wrote 2

In [5]:
merged_with_chartevents_filled_renamed_path = Path("merged_with_chartevents_filled_renamed.csv")
merged_with_chartevents_filled_renamed = pd.read_csv(merged_with_chartevents_filled_renamed_path, nrows=500)


  merged_with_chartevents_filled_renamed = pd.read_csv(merged_with_chartevents_filled_renamed_path, nrows=500)


In [6]:
merged_with_chartevents_filled_renamed

Unnamed: 0,subject_id,hadm_id,day_index,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,Problem_List,HR,HR_Alarm_-_High,HR_Alarm_-_Low,Heart_Rhythm,ABPs,ABPd,ABPm,ABP_Alarm_-_Low,ABP_Alarm_-_High,PAPs,PAPd,PAPm,PAP_Alarm_-_High,PAP_Alarm_-_Low,LAP,CVP_Alarm_-_High,CVP_Alarm_-_Low,CVP,CO_thermodilution,Intra_Aortic_Ballon_Pump_Setting,LVAD,RVAD,NBPs,NBPd,NBPm,Temp_AV_interval,RR,VC,PO2_Arterial,SaO2,Hemoglobin,PCO2_Arterial,CO2_production,PH_Venous,SpO2,Resistance,MV_Alarm_-_Low,MV_Alarm_-_High,PEEP_set,ACT,ZFibrinogen,Hematocrit_serum,...,NPi_-_Diff,Pupil_Size_-_Left,Pupil_Size_-_Right,Pupil_Size_-_Diff,Minnesota_Tube_Mark,Placement_Confirmed_MT,Stomach_Balloon_Status_MT,Stomach_Suction_Status_MT,Esophageal_Balloon_Status_MT,Esophageal_Balloon_Pressure_MT,Esophageal_Suction_Status_MT,Device_in_Traction_MT,CVL_SecurePort_IV,Sedatives,Analgesics,Vasoactive,Paramaters_evaluated,HD_SecurePort_IV,CI_SecurePort_IV,PICC_SecurePort_IV,TUN_SecurePort_IV,MAC_SecurePort_IV,Chest_SecurePort_IV,Suction_events_L,Suction_events_R,Internal_FMS_Care,Safety_Related_to,Safe_Room,Safe_Patient,Safe_Staff,Unable_to_assess_reproductive,Pain_Level_Acceptable_PreIntervention,P01,Blocker_Type,Blocker_Mark,Blocker_External_Location,Blocker_Secured,Blocker_Balloon_Status,Blocker_Volume,CFS-HR,CFS-RR,CFS-SBP,CFS-Temp,CFS-Sweating,CFS-Posturing,Oxymetry_location_ECMO,Oxymetry_location_CH,StO2_R_ECMO,StO2_R_CH,StO2_L_CH,StO2_L_ECMO,Pint_CH,Pven_Alarm_High_CH,Pint_Alarm_Hi_CH,Delta_P_ECMO,Delta_P_Alarm_ECMO,Part_Alarm_Hi_ECMO,Pint_Alarm_Hi_ECMO,IUC_Stabilization_Device,CRRT_-_Filter_Type
0,10000935,26381316,0,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,10000935,26381316,1,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10000935,26381316,2,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10000935,26381316,3,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,10000935,26381316,4,2187-08-23 21:22:00,2187-08-27 15:35:00,,EW EMER.,P52V4D,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,English,SINGLE,BLACK/AFRICAN AMERICAN,2187-08-23 14:37:00,2187-08-23 22:46:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,10016742,29281842,4,2178-07-03 21:13:00,2178-07-08 20:20:00,,EW EMER.,P61W98,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicaid,English,SINGLE,BLACK/AFRICAN AMERICAN,2178-07-03 17:39:00,2178-07-03 22:45:00,0,.Care Plan - Altered Respiratory Function: Ine...,87.0,130.0,50.0,SR (Sinus Rhythm),,,,,,,,,,,,,,,,,,,172.0,118.0,130.0,,23.0,,,,10.4,,159.0,,100.0,,4.0,21.0,5.0,,,33.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
496,10016742,29281842,5,2178-07-03 21:13:00,2178-07-08 20:20:00,,EW EMER.,P61W98,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicaid,English,SINGLE,BLACK/AFRICAN AMERICAN,2178-07-03 17:39:00,2178-07-03 22:45:00,0,,86.0,125.0,50.0,SR (Sinus Rhythm),,,,,,,,,,,,,,,,,,,155.0,98.0,108.0,,26.0,,,,9.8,,165.0,,100.0,,4.0,21.0,5.0,,,30.5,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
497,10017285,23282506,0,2166-05-05 14:59:00,2166-05-14 15:43:00,,EW EMER.,P276OU,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicaid,Portuguese,MARRIED,HISPANIC OR LATINO,2166-05-05 13:17:00,2166-05-05 16:40:00,0,Subdural hemorrhage (SDH),76.0,120.0,60.0,SR (Sinus Rhythm),,,,,,,,,,,,,,,,,,,135.0,75.0,90.0,,15.0,,,,9.4,,,,100.0,,4.0,16.0,5.0,,,30.7,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
498,10017285,23282506,1,2166-05-05 14:59:00,2166-05-14 15:43:00,,EW EMER.,P276OU,EMERGENCY ROOM,CHRONIC/LONG TERM ACUTE CARE,Medicaid,Portuguese,MARRIED,HISPANIC OR LATINO,2166-05-05 13:17:00,2166-05-05 16:40:00,0,Subdural hemorrhage (SDH),96.0,120.0,60.0,SR (Sinus Rhythm),,,,,,,,,,,,,,,,,,,142.0,63.0,81.0,,26.0,,217.0,,8.9,51.0,,,100.0,,4.0,16.0,10.0,,,28.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
