In [17]:
import pandas as pd
import gc
!pip install icd-mappings
from icdmappings import Mapper
from google.colab import drive

Mounted at /content/drive


In [30]:
# Diagnosis table is read on the CPU in fixed-size pieces; set up the inputs here.
drive.mount('/content/drive', force_remount=True)
chunks = list()       # collects each piece before they are concatenated
chunk_size = 100_000  # rows per piece; tune to available memory
file_path = "/content/drive/MyDrive/MIMIC/mimic-iv-ed-2.2/ed/diagnosis.csv"

Mounted at /content/drive


In [31]:
# Read the CSV piece by piece. The dtypes are fixed at parse time so every chunk
# is typed the same way: without an explicit string dtype, a chunk whose ICD
# codes all look numeric would be inferred as integers, and codes such as
# "07070" would lose their leading zero.
for chunk in pd.read_csv(
    file_path,
    chunksize=chunk_size,
    dtype={"icd_code": str, "icd_version": "int32"},
):
    chunk["icd_code"] = chunk["icd_code"].astype("category")
    chunks.append(chunk)

In [32]:
# Stitch the pieces into one frame. Concatenating categorical columns whose
# category sets differ yields an object column, so the category dtype is
# restored once on the combined frame.
diagnosis = pd.concat(chunks, ignore_index=True)
diagnosis["icd_code"] = diagnosis["icd_code"].astype("category")
# Drop the per-chunk list and reclaim its memory.
del chunks
gc.collect()

92

In [33]:
# Confirm that the load finished.
load_status = "Diagnosis dataset loaded using CPU."
print(load_status)

Diagnosis dataset loaded using CPU.


In [34]:
# Preview the first 20 rows of the loaded table.
diagnosis_preview = diagnosis.head(20)
print(diagnosis_preview)

    subject_id   stay_id  seq_num icd_code  icd_version  \
0     10000032  32952584        1     4589            9   
1     10000032  32952584        2    07070            9   
2     10000032  32952584        3      V08            9   
3     10000032  33258284        1     5728            9   
4     10000032  33258284        2    78959            9   
5     10000032  33258284        3    07070            9   
6     10000032  33258284        4      V08            9   
7     10000032  35968195        1     5715            9   
8     10000032  35968195        2    78900            9   
9     10000032  35968195        3      V08            9   
10    10000032  38112554        1    78959            9   
11    10000032  38112554        2    07070            9   
12    10000032  38112554        3     5715            9   
13    10000032  38112554        4      V08            9   
14    10000032  39399961        1    78097            9   
15    10000032  39399961        2    34830            9 

In [35]:
# Create the icd-mappings Mapper used by the mapping cells below
# (the package itself is installed in the first cell).
mapper = Mapper()

In [36]:
# One boolean row mask per ICD version, plus a plain-string copy of the codes
icd10_mask = diagnosis["icd_version"].eq(10)
icd9_mask = diagnosis["icd_version"].eq(9)
diagnosis["icd_code_str"] = diagnosis["icd_code"].astype(str)

In [37]:
# ICD-9 rows: translate each code to ICD-10; every other row keeps None
icd9_codes = diagnosis.loc[icd9_mask, "icd_code_str"].tolist()
diagnosis["icd10_mapped"] = None
diagnosis.loc[icd9_mask, "icd10_mapped"] = mapper.map(icd9_codes, source='icd9', target='icd10')

In [38]:
# Start with a chapter column holding None in every row. The next two cells
# fill it: first for rows coded natively in ICD-10, then for ICD-9 rows via
# their mapped ICD-10 code.
diagnosis["chapter"] = None

In [39]:
# Rows already coded in ICD-10: look up the chapter straight from the code
icd10_codes = diagnosis.loc[icd10_mask, "icd_code_str"].tolist()
diagnosis.loc[icd10_mask, "chapter"] = mapper.map(icd10_codes, source='icd10', target='chapter')

In [40]:
# ICD-9 rows: derive the chapter from the ICD-10 code produced by the earlier
# ICD-9 → ICD-10 mapping. The codes are handed over as a plain list, matching
# the other mapper.map calls in this notebook.
diagnosis.loc[icd9_mask, "chapter"] = mapper.map(
    diagnosis.loc[icd9_mask, "icd10_mapped"].tolist(),
    source='icd10',
    target='chapter'
)

In [41]:
# Preview the first 20 rows now that the mapping columns have been added
mapped_preview = diagnosis.head(20)
print(mapped_preview)

    subject_id   stay_id  seq_num icd_code  icd_version  \
0     10000032  32952584        1     4589            9   
1     10000032  32952584        2    07070            9   
2     10000032  32952584        3      V08            9   
3     10000032  33258284        1     5728            9   
4     10000032  33258284        2    78959            9   
5     10000032  33258284        3    07070            9   
6     10000032  33258284        4      V08            9   
7     10000032  35968195        1     5715            9   
8     10000032  35968195        2    78900            9   
9     10000032  35968195        3      V08            9   
10    10000032  38112554        1    78959            9   
11    10000032  38112554        2    07070            9   
12    10000032  38112554        3     5715            9   
13    10000032  38112554        4      V08            9   
14    10000032  39399961        1    78097            9   
15    10000032  39399961        2    34830            9 