<a href="https://colab.research.google.com/github/mbakos95/ICU-Mortality-Prediction-MIMIC-III-/blob/main/ICU_Mortality_Prediction_%E2%80%93_Notebook_1_Dataset_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Step 1: Install Required Libraries**

In [None]:
!pip install pandas --quiet
!pip install numpy --quiet


# **Step 2: Import Libraries**

In [None]:
import os
import pandas as pd
import numpy as np


# **Step 3: Mount Google Drive and Set Working Directory**


In [None]:
# Colab-only step: expose the user's Google Drive inside the runtime file system.
from google.colab import drive
drive.mount('/content/drive')

# Folder on Drive holding the project files. The working directory is switched
# to it so that the relative file names used by later cells (e.g. "discharge.csv")
# resolve inside this folder. os.chdir raises if the folder does not exist.
# NOTE(review): path is specific to the author's Drive layout - adjust before re-running.
project_path = "/content/drive/MyDrive/AIDL MASTER/AIDL_02/Final project/ForTheProfessor"
os.chdir(project_path)


Mounted at /content/drive


# **Step 4: Load Raw CSV Files**


In [None]:
# Read the three raw tables from the current working directory, in a fixed
# order, and bind them to their conventional names.
discharge_df, admissions_df, patients_df = (
    pd.read_csv(csv_name)
    for csv_name in ("discharge.csv", "admissions.csv", "patients.csv")
)

# Quick sanity check: (rows, columns) of each table, in load order.
loaded_shapes = [frame.shape for frame in (discharge_df, admissions_df, patients_df)]
print(" Loaded:", *loaded_shapes)


 Loaded: (331793, 8) (431231, 16) (299712, 6)


# **Step 5: Merge Tables and Generate Mortality Labels**


In [None]:
# Build one row per discharge note carrying the admission's discharge time and
# the patient's date of death, then derive a binary 30-day mortality label.

# A death counts as positive when it falls at most this many whole days after discharge.
MORTALITY_WINDOW_DAYS = 30

# Attach dischtime to each note. The inner join drops notes whose hadm_id has no
# admission row. validate= makes pandas raise MergeError if hadm_id is duplicated
# in admissions, which would otherwise silently duplicate notes.
merged_df = discharge_df.merge(
    admissions_df[['hadm_id', 'dischtime']],
    on='hadm_id',
    how='inner',
    validate='many_to_one',
)

# Attach date of death (dod). The left join keeps notes of patients with no
# matching patient row; their dod is missing.
merged_df = merged_df.merge(
    patients_df[['subject_id', 'dod']],
    on='subject_id',
    how='left',
    validate='many_to_one',
)

# Parse both timestamps so they can be subtracted.
merged_df['dischtime'] = pd.to_datetime(merged_df['dischtime'])
merged_df['dod'] = pd.to_datetime(merged_df['dod'])

# Vectorised replacement for a row-wise apply: whole days from discharge to death.
# A missing dod (or dischtime) gives NaN, and NaN <= N is False, so those rows get label 0.
# NOTE(review): a dod earlier than dischtime gives a negative day count and is
# therefore also labelled 1 - confirm this is the intended cohort definition.
days_to_death = (merged_df['dod'] - merged_df['dischtime']).dt.days
merged_df['mortality_30d'] = (days_to_death <= MORTALITY_WINDOW_DAYS).astype('int64')

print(" Label counts:\n", merged_df['mortality_30d'].value_counts())


 Label counts:
 mortality_30d
0    314099
1     17694
Name: count, dtype: int64


# **Step 6: Filter Notes by Type and Length**


In [None]:
# Row-wise conditions a note must satisfy to be kept.
is_discharge_summary = merged_df['note_type'] == 'DS'
has_text = merged_df['text'].notnull()
# A missing text has NaN length, which compares False, so it can never pass here.
is_long_enough = merged_df['text'].str.len() > 500

# Keep discharge summaries whose text is present and longer than 500 characters.
filtered_df = merged_df[is_discharge_summary & has_text & is_long_enough]

print("Filtered shape:", filtered_df.shape)


Filtered shape: (331791, 11)


# **Step 7: Balance the Dataset**


In [None]:
# Draw the same number of notes from each class, then shuffle the result.
SAMPLES_PER_CLASS = 15000  # upper bound on rows taken from each class
RANDOM_STATE = 42          # fixed seed so sampling and shuffling are reproducible

negatives = filtered_df[filtered_df['mortality_30d'] == 0]
positives = filtered_df[filtered_df['mortality_30d'] == 1]

# Sampling without replacement raises if n exceeds the class size, so cap the
# request at the smaller class. With both classes >= SAMPLES_PER_CLASS this is
# exactly SAMPLES_PER_CLASS.
n_per_class = min(SAMPLES_PER_CLASS, len(negatives), len(positives))
if n_per_class == 0:
    raise ValueError("Cannot balance the dataset: at least one mortality_30d class has no rows.")

df_0 = negatives.sample(n=n_per_class, random_state=RANDOM_STATE)
df_1 = positives.sample(n=n_per_class, random_state=RANDOM_STATE)

# Interleave the two classes (frac=1 is a full shuffle) and renumber the rows.
balanced_df = (
    pd.concat([df_0, df_1])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)


# **Step 8: Save Final Output as CSV**


In [None]:
# Persist only the identifiers, the note text and the label.
output_columns = ['subject_id', 'hadm_id', 'text', 'mortality_30d']
output_df = balanced_df[output_columns]
output_df.to_csv("discharge_balanced.csv", index=False)

# Report the shape of what was actually written, not of the wider in-memory frame.
print("Saved file: discharge_balanced.csv with shape:", output_df.shape)


Saved file: discharge_balanced.csv with shape: (30000, 11)
