In [None]:
# diagnosis_processing_cleaned.py

In [None]:
import gc

import pandas as pd

# `Mapper` is intentionally not imported in this cell: the package that provides it
# is pip-installed by a later cell (which also performs the import), so importing it
# here fails on a fresh kernel before that install has run.

In [None]:
# Settings for the chunked, CPU-only load of the diagnosis table.
# `file_path`  - gzipped diagnosis CSV on the mounted Drive.
# `chunk_size` - rows read per chunk; lower it if memory is tight.
# `chunks`     - accumulator that the loading cell fills with one DataFrame per chunk.
file_path = (
    "/content/drive/MyDrive/MIMIC/mimic-iv-ed-2.2/ed/diagnosis.csv.gz"
)
chunk_size = 100_000
chunks = list()

In [None]:
# Read the gzipped CSV in `chunk_size`-row pieces and collect them in `chunks`.
#
# Dtypes are fixed at parse time instead of being cast afterwards:
#   * `icd_code` is read as text. Letting pandas infer the type risks numeric-looking
#     codes being parsed as integers, which drops leading zeros before the later
#     string conversion.
#   * `icd_version` is read directly as int32.
# No per-chunk `category` cast is done: each chunk would get its own category set,
# and pd.concat falls back to object dtype when the category sets differ.
#
# `chunks` is rebuilt from scratch so that re-running this cell cannot append the
# same rows twice.
with pd.read_csv(
    file_path,
    compression="gzip",
    usecols=["stay_id", "subject_id", "icd_code", "icd_version"],
    dtype={"icd_code": str, "icd_version": "int32"},
    chunksize=chunk_size,
) as reader:
    chunks = list(reader)

In [None]:
# Stack the per-chunk frames row-wise into a single table with a fresh index,
# then drop the list of chunks and ask the garbage collector to reclaim it.
diagnosis = pd.concat(chunks, axis=0, ignore_index=True)

del chunks
gc.collect()

In [None]:
# Status line confirming the load cell sequence has finished.
print(
    "✅ Diagnosis dataset loaded using CPU."
)

In [None]:
# Install and use icd-mappings
!pip install -q icd-mappings
from icd_mapping import Mapper
mapper = Mapper()

In [None]:
# Build a plain-string copy of the codes plus one boolean row mask per ICD version.
icd9_mask = diagnosis["icd_version"].eq(9)
icd10_mask = diagnosis["icd_version"].eq(10)
diagnosis["icd_code_str"] = diagnosis["icd_code"].astype(str)

In [None]:
# Translate every ICD9 row's code to ICD10 and store it in `icd10_mapped`.
# Rows that are not ICD9 keep the initial None.
icd9_codes = diagnosis.loc[icd9_mask, "icd_code_str"].tolist()
icd10_equivalents = mapper.map(icd9_codes, source='icd9', target='icd10')

diagnosis["icd10_mapped"] = None
diagnosis.loc[icd9_mask, "icd10_mapped"] = icd10_equivalents

In [None]:
# Chapter lookup for ICD10 codes (native and converted): start from an object
# column filled with None that the following cells populate per ICD version.
diagnosis["chapter"] = pd.Series(
    [None] * len(diagnosis), index=diagnosis.index, dtype="object"
)

In [None]:
# Rows recorded natively in ICD10: look up the chapter straight from the code string.
native_icd10_codes = diagnosis.loc[icd10_mask, "icd_code_str"].tolist()
native_icd10_chapters = mapper.map(
    native_icd10_codes, source='icd10', target='chapter'
)
diagnosis.loc[icd10_mask, "chapter"] = native_icd10_chapters

In [None]:
# ICD9 converted to ICD10 → chapter
# Only ICD9 rows that actually have a value in `icd10_mapped` are looked up, so the
# mapper is never handed a missing code; the remaining ICD9 rows keep chapter = None.
# The codes are passed as a plain list, matching the other mapper.map calls in this
# notebook.
icd9_converted_mask = icd9_mask & diagnosis["icd10_mapped"].notna()
if icd9_converted_mask.any():
    diagnosis.loc[icd9_converted_mask, "chapter"] = mapper.map(
        diagnosis.loc[icd9_converted_mask, "icd10_mapped"].tolist(),
        source='icd10',
        target='chapter'
    )

In [None]:
# Print a plain-text preview of the first 20 rows of the processed table.
print(diagnosis.iloc[:20])