In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

DATA_PATH = "/Users/mariehumbertdroz/Documents/Data/mimic-iv-3.1" 

In [None]:
def load_mimic_tables():
    """Load the essential MIMIC-IV tables for readmission prediction"""
    
    print("Loading MIMIC-IV tables...")
    
    # Core tables we need
    tables = {
        'admissions': f'{DATA_PATH}/hosp/admissions.csv',
        'patients': f'{DATA_PATH}/hosp/patients.csv',
        'diagnoses_icd': f'{DATA_PATH}/hosp/diagnoses_icd.csv',
        #'procedures_icd': f'{DATA_PATH}/hosp/procedures_icd.csv',
        'labevents': f'{DATA_PATH}/hosp/labevents.csv',
        #'prescriptions': f'{DATA_PATH}/hosp/prescriptions.csv',
    }
    
    data = {}
    
    for table_name, file_path in tables.items():
        try:
            print(f"Loading {table_name}...")
            data[table_name] = pd.read_csv(file_path)
            print(f"Shape: {data[table_name].shape}")
        except FileNotFoundError:
            print(f"Could not find {file_path}")
            data[table_name] = None
    
    return data

In [5]:
mimic_data = load_mimic_tables()

Loading MIMIC-IV tables...
Loading admissions...
Shape: (546028, 16)
Loading patients...
Shape: (364627, 6)
Loading diagnoses_icd...
Shape: (6364488, 5)
Loading procedures_icd...
Shape: (859655, 6)
Loading labevents...
Shape: (158374764, 16)
Loading prescriptions...
Shape: (20292611, 21)


In [None]:
def preprocess_admissions_data(admissions_df):
    """Preprocess the admissions table to add length of stay"""
    
    print(f"Total admissions: {len(admissions_df):,}")
    print(f"Unique patients: {admissions_df['subject_id'].nunique():,}")
    
    # Convert datetime columns
    admissions_df['admittime'] = pd.to_datetime(admissions_df['admittime'])
    admissions_df['dischtime'] = pd.to_datetime(admissions_df['dischtime'])
    
    # Calculate length of stay
    admissions_df['los_days'] = (admissions_df['dischtime'] - admissions_df['admittime']).dt.days
    
    print(f"Average length of stay: {admissions_df['los_days'].mean():.1f} days")
    print(f"Median length of stay: {admissions_df['los_days'].median():.1f} days")
    
    # Show admission types
    print("\nAdmission types:")
    print(admissions_df['admission_type'].value_counts())
    
    return admissions_df

In [7]:
admissions = preprocess_admissions_data(mimic_data['admissions'])

Total admissions: 546,028
Unique patients: 223,452
Average length of stay: 4.2 days
Median length of stay: 2.0 days

Admission types:
admission_type
EW EMER.                       177459
EU OBSERVATION                 119456
OBSERVATION ADMIT               84437
URGENT                          54929
SURGICAL SAME DAY ADMISSION     42898
DIRECT OBSERVATION              24551
DIRECT EMER.                    21973
ELECTIVE                        13130
AMBULATORY OBSERVATION           7195
Name: count, dtype: int64


In [8]:
admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,los_days
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P49AFC,TRANSFER FROM HOSPITAL,HOME,Medicaid,English,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P784FA,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0,1
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P19UTS,EMERGENCY ROOM,HOSPICE,Medicaid,English,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0,1
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P06OTX,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0,2
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P39NWO,EMERGENCY ROOM,,,English,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0,0


In [None]:
def create_tutorial_subset(admissions_df, n_patients=100000, random_state=42):
    """
    Create a representative subset for tutorial purposes
    
    Parameters:
    - admissions_df: Full admissions dataframe
    - n_patients: Number of patients to sample (default: 100k)
    - random_state: For reproducibility
    """
    
    # Get unique patients
    all_patients = admissions_df['subject_id'].unique()
    print(f"Total patients available: {len(all_patients):,}")
    
    # Sample patients (not admissions) to maintain patient history
    np.random.seed(random_state)
    selected_patients = np.random.choice(
        all_patients, 
        size=min(n_patients, len(all_patients)), 
        replace=False
    )
    
    # Filter admissions to selected patients
    subset_admissions = admissions_df[
        admissions_df['subject_id'].isin(selected_patients)
    ].copy()
    
    print(f"Selected patients: {len(selected_patients):,}")
    print(f"Selected admissions: {len(subset_admissions):,}")
    print(f"Avg admissions per patient: {len(subset_admissions)/len(selected_patients):.1f}")
    
    # Check readmission rate is preserved
    if 'readmitted' in subset_admissions.columns:
        readmit_rate = subset_admissions['readmitted'].mean()
        print(f"Readmission rate in subset: {readmit_rate:.2%}")
    
    return subset_admissions, selected_patients




🎯 Creating Tutorial-Friendly Dataset


In [None]:
# Create subset for tutorial
admissions_subset, selected_patients = create_tutorial_subset(
    admissions, 
    n_patients=100000,
    random_state=42
)

📊 Creating tutorial subset: 100,000 patients
--------------------------------------------------
Total patients available: 223,452
Selected patients: 100,000
Selected admissions: 244,240
Avg admissions per patient: 2.4


In [None]:
def create_readmission_target(admissions_df, readmission_days=30):
    """
    Create binary target variable for readmission within specified days
    
    Parameters:
    - admissions_df: DataFrame with admission records
    - readmission_days: Number of days to define readmission (default: 30)
    """
    
    print(f"Creating {readmission_days}-day readmission target...")
    
    # Sort by patient and admission time
    admissions_sorted = admissions_df.sort_values(['subject_id', 'admittime']).copy()
    
    # For each admission, check if there's another admission within readmission_days
    admissions_sorted['readmitted'] = 0
    
    # Group by patient to find readmissions
    for subject_id, group in admissions_sorted.groupby('subject_id'):
        if len(group) > 1:  # Patient has multiple admissions
            group_sorted = group.sort_values('admittime')
            
            for i in range(len(group_sorted) - 1):
                current_discharge = group_sorted.iloc[i]['dischtime']
                next_admission = group_sorted.iloc[i + 1]['admittime']
                
                # Calculate days between discharge and next admission
                if pd.notna(current_discharge) and pd.notna(next_admission):
                    days_diff = (next_admission - current_discharge).days
                    
                    if 0 <= days_diff <= readmission_days:
                        # Mark current admission as having readmission
                        hadm_id = group_sorted.iloc[i]['hadm_id']
                        admissions_sorted.loc[
                            admissions_sorted['hadm_id'] == hadm_id, 'readmitted'
                        ] = 1
    
    readmission_rate = admissions_sorted['readmitted'].mean()
    print(f"{readmission_days}-day readmission rate: {readmission_rate:.2%}")
    print(f"Readmissions: {admissions_sorted['readmitted'].sum():,}")
    print(f"No readmissions: {(admissions_sorted['readmitted'] == 0).sum():,}")
    
    return admissions_sorted

In [12]:
# Create readmission target
admissions_with_target = create_readmission_target(admissions_subset, readmission_days=30)

Creating 30-day readmission target...
  ✓ 30-day readmission rate: 20.38%
  ✓ Readmissions: 49,781
  ✓ No readmissions: 194,459


In [13]:
admissions_with_target.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,los_days,readmitted
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P49AFC,TRANSFER FROM HOSPITAL,HOME,Medicaid,English,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0,0,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P784FA,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0,1,1
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P06OTX,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0,2,1
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P19UTS,EMERGENCY ROOM,HOSPICE,Medicaid,English,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0,1,0
8,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,,EU OBSERVATION,P47EY8,EMERGENCY ROOM,,Medicaid,English,DIVORCED,WHITE,2181-11-14 21:51:00,2181-11-15 09:57:00,0,0,0


In [None]:
def filter_related_tables(mimic_data, selected_patients):
    """Filter all MIMIC tables to selected patients"""
    
    print("Filtering related tables to selected patients...")
    
    filtered_data = {}
    
    for table_name, df in mimic_data.items():
        if df is not None and 'subject_id' in df.columns:
            filtered_df = df[df['subject_id'].isin(selected_patients)].copy()
            filtered_data[table_name] = filtered_df
            
            reduction = (1 - len(filtered_df)/len(df)) * 100
            print(f"{table_name}: {len(df):,} --> {len(filtered_df):,} ({reduction:.1f}% reduction)")
        else:
            filtered_data[table_name] = df
    
    return filtered_data



In [15]:
# Filter all tables
mimic_data_subset = filter_related_tables(mimic_data, selected_patients)

Filtering related tables to selected patients...
  admissions: 546,028 → 244,240 (55.3% reduction)
  patients: 364,627 → 100,000 (72.6% reduction)
  diagnoses_icd: 6,364,488 → 2,849,938 (55.2% reduction)
  procedures_icd: 859,655 → 384,987 (55.2% reduction)
  labevents: 158,374,764 → 67,578,075 (57.3% reduction)
  prescriptions: 20,292,611 → 9,081,341 (55.2% reduction)


In [None]:
def process_demographics(patients_df, admissions_df):
    """Extract and process patient demographic features"""
    
    if patients_df is None or admissions_df is None:
        return None
    
    print("Processing demographics...")

    
    # Merge with admissions to calculate age at admission
    demo_features = admissions_df.merge(
        patients_df[['subject_id', 'gender', 'anchor_age']], 
        on='subject_id', 
        how='left'
    )
    # age at admission is already available in MIMIC-IV
    demo_features['age_at_admission'] = demo_features['anchor_age']
    
    # Create age groups
    demo_features['age_group'] = pd.cut(
        demo_features['age_at_admission'], 
        bins=[0, 18, 35, 50, 65, 80, 100], 
        labels=['<18', '18-35', '35-50', '50-65', '65-80', '80+']
    )
    
    # Gender encoding (F=0, M=1)
    demo_features['gender_male'] = (demo_features['gender'] == 'M').astype(int)
    
    print(f"Age statistics:")
    print(f"Mean age: {demo_features['age_at_admission'].mean():.1f} years")
    print(f"Median age: {demo_features['age_at_admission'].median():.1f} years")
    
    print(f"Gender distribution:")
    print(demo_features['gender'].value_counts())
    
    return demo_features


In [17]:
# Process demographics
demographics = process_demographics(mimic_data_subset['patients'], admissions_with_target)

Processing demographics...
  ✓ Age statistics:
    Mean age: 57.1 years
    Median age: 59.0 years
  ✓ Gender distribution:
gender
F    126346
M    117894
Name: count, dtype: int64


In [18]:
demographics.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,...,edregtime,edouttime,hospital_expire_flag,los_days,readmitted,gender,anchor_age,age_at_admission,age_group,gender_male
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P49AFC,TRANSFER FROM HOSPITAL,HOME,Medicaid,...,2180-05-06 19:17:00,2180-05-06 23:30:00,0,0,0,F,52,52,50-65,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P784FA,EMERGENCY ROOM,HOME,Medicaid,...,2180-06-26 15:54:00,2180-06-26 21:31:00,0,1,1,F,52,52,50-65,0
2,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P06OTX,EMERGENCY ROOM,HOME,Medicaid,...,2180-07-23 05:54:00,2180-07-23 14:00:00,0,2,1,F,52,52,50-65,0
3,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P19UTS,EMERGENCY ROOM,HOSPICE,Medicaid,...,2180-08-05 20:58:00,2180-08-06 01:44:00,0,1,0,F,52,52,50-65,0
4,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,,EU OBSERVATION,P47EY8,EMERGENCY ROOM,,Medicaid,...,2181-11-14 21:51:00,2181-11-15 09:57:00,0,0,0,F,48,48,35-50,0


In [None]:
from icdmappings import Mapper

def process_diagnoses(diagnoses_df, top_n=50):
    """
    Extract top diagnosis categories as features, mapping ICD-9 to ICD-10
    
    Parameters:
    - diagnoses_df: DataFrame with ICD diagnosis codes
    - top_n: Number of top diagnoses to include as features
    """
    
    print(f"Processing diagnoses with ICD-9 to ICD-10 mapping...")

    icd_mapper = Mapper()
    
    # Create working copy
    diagnoses_mapped = diagnoses_df.copy()
    
    # Check ICD version distribution
    icd9_count = diagnoses_mapped[diagnoses_mapped['icd_version'] == 9].shape[0]
    icd10_count = diagnoses_mapped[diagnoses_mapped['icd_version'] == 10].shape[0]
    
    print(f"  ✓ Original distribution: ICD-9: {icd9_count:,}, ICD-10: {icd10_count:,}")
    
    # Map ICD-9 codes to ICD-10 where possible
    mapped_count = 0
    for idx, row in diagnoses_mapped.iterrows():
        if row['icd_version'] == 9:
            diagnoses_mapped.at[idx, 'icd_code'] = icd_mapper.map(row['icd_code'], source='icd9', target='icd10')
            diagnoses_mapped.at[idx, 'icd_version'] = 10
            mapped_count += 1
    
    print(f"  ✓ Mapped {mapped_count:,} ICD-9 codes to ICD-10")
    
    # For unmapped ICD-9 codes, we'll keep them but prefix with "ICD9_"
    # This way we don't lose information but can distinguish versions
    # This should not happen with this library
    unmapped_mask = diagnoses_mapped['icd_version'] == 9
    diagnoses_mapped.loc[unmapped_mask, 'icd_code'] = 'ICD9_' + diagnoses_mapped.loc[unmapped_mask, 'icd_code']
    
    unmapped_count = unmapped_mask.sum()
    print(f"Kept {unmapped_count:,} unmapped ICD-9 codes with 'ICD9_' prefix")
    
    # Now get top diagnoses from the processed codes
    top_diagnoses = diagnoses_mapped['icd_code'].value_counts().head(top_n).index
    
    print(f"Selected top {top_n} diagnoses after mapping")
    
    # Create binary features for each top diagnosis
    diagnosis_features = []
    
    for hadm_id, group in diagnoses_mapped.groupby('hadm_id'):
        # Create a row for this admission
        row = {'hadm_id': hadm_id}
        
        admission_diagnoses = set(group['icd_code'].tolist())
        
        for diagnosis in top_diagnoses:
            # Clean column name for better readability
            col_name = f'diag_{str(diagnosis).replace(".", "_")}'
            row[col_name] = int(diagnosis in admission_diagnoses)
        
        diagnosis_features.append(row)
    
    diagnosis_df = pd.DataFrame(diagnosis_features)
    
    print(f"Created {len(top_diagnoses)} diagnosis features")
    print(f"Coverage: {len(diagnosis_df)} admissions")
    
    # Show most common diagnoses after mapping
    print(f"Top 5 diagnoses after ICD mapping:")
    for i, diag in enumerate(top_diagnoses[:5]):
        count = diagnoses_mapped[diagnoses_mapped['icd_code'] == diag].shape[0]
        diag_type = "ICD-10" if not diag.startswith('ICD9_') else "ICD-9"
        print(f"    {i+1}. {diag} ({diag_type}): {count:,} cases")
    
    return diagnosis_df

In [20]:
diagnosis_features = process_diagnoses(mimic_data_subset['diagnoses_icd'], top_n=30)

Processing diagnoses with ICD-9 to ICD-10 mapping...
  ✓ Original distribution: ICD-9: 1,299,507, ICD-10: 1,550,431
  ✓ Mapped 1,299,507 ICD-9 codes to ICD-10
  ✓ Kept 0 unmapped ICD-9 codes with 'ICD9_' prefix
  ✓ Selected top 30 diagnoses after mapping
  ✓ Created 30 diagnosis features
  ✓ Coverage: 243991 admissions
  ✓ Top 5 diagnoses after ICD mapping:
    1. I10 (ICD-10): 85,291 cases
    2. E785 (ICD-10): 68,035 cases
    3. K219 (ICD-10): 47,232 cases
    4. Z87891 (ICD-10): 42,502 cases
    5. I2510 (ICD-10): 40,386 cases


In [None]:
def process_lab_values(labevents_df, admissions_df, important_labs=None):
    """
    Extract summary statistics for important lab values
    
    Parameters:
    - labevents_df: DataFrame with lab events
    - admissions_df: DataFrame with admissions
    - important_labs: List of important lab itemids
    """

    # Define important lab tests (common ones for readmission prediction)
    if important_labs is None:
        important_labs = {
            50868: 'aniongap',      # Anion Gap
            50882: 'bicarbonate',   # Bicarbonate
            50893: 'calcium',       # Calcium
            50902: 'chloride',      # Chloride
            50912: 'creatinine',    # Creatinine
            50931: 'glucose',       # Glucose
            50960: 'magnesium',     # Magnesium
            50970: 'phosphate',     # Phosphate
            50971: 'potassium',     # Potassium
            50983: 'sodium',        # Sodium
            51006: 'urea',          # Urea Nitrogen
            51221: 'hematocrit',    # Hematocrit
            51222: 'hemoglobin',    # Hemoglobin
            51265: 'platelet',      # Platelet Count
            51301: 'wbc'            # White Blood Cells
        }
    
    print(f"Processing {len(important_labs)} important lab values...")
    
    # Filter to important labs only
    important_lab_data = labevents_df[
        labevents_df['itemid'].isin(important_labs.keys())
    ].copy()
    
    # Convert lab values to numeric (handle text values)
    important_lab_data['valuenum'] = pd.to_numeric(
        important_lab_data['valuenum'], errors='coerce'
    )
    
    # Remove extreme outliers (99.5th percentile filter)
    for itemid in important_labs.keys():
        lab_data = important_lab_data[important_lab_data['itemid'] == itemid]
        if len(lab_data) > 0:
            upper_limit = lab_data['valuenum'].quantile(0.995)
            lower_limit = lab_data['valuenum'].quantile(0.005)
            mask = (
                (important_lab_data['itemid'] == itemid) & 
                (important_lab_data['valuenum'] > upper_limit)
            ) | (
                (important_lab_data['itemid'] == itemid) & 
                (important_lab_data['valuenum'] < lower_limit)
            )
            important_lab_data.loc[mask, 'valuenum'] = np.nan
    
    # Aggregate lab values by admission
    lab_features = []
    
    for hadm_id in admissions_df['hadm_id'].unique():
        admission_labs = important_lab_data[
            important_lab_data['hadm_id'] == hadm_id
        ]
        
        row = {'hadm_id': hadm_id}
        
        for itemid, lab_name in important_labs.items():
            lab_values = admission_labs[
                admission_labs['itemid'] == itemid
            ]['valuenum'].dropna()
            
            if len(lab_values) > 0:
                # Create summary statistics
                row[f'{lab_name}_mean'] = lab_values.mean()
                row[f'{lab_name}_min'] = lab_values.min()
                row[f'{lab_name}_max'] = lab_values.max()
                row[f'{lab_name}_count'] = len(lab_values)
            else:
                # Missing values
                row[f'{lab_name}_mean'] = np.nan
                row[f'{lab_name}_min'] = np.nan
                row[f'{lab_name}_max'] = np.nan
                row[f'{lab_name}_count'] = 0
        
        lab_features.append(row)
    
    lab_df = pd.DataFrame(lab_features)
    
    print(f"Created lab features for {len(lab_df)} admissions")
    print(f"Lab features per admission: {len(important_labs) * 4}")
    
    # Show data availability
    for itemid, lab_name in list(important_labs.items())[:5]:
        availability = (lab_df[f'{lab_name}_count'] > 0).mean()
        print(f"    {lab_name}: {availability:.1%} of admissions")
    
    return lab_df

In [None]:
lab_features = process_lab_values(
    mimic_data_subset['labevents'], 
    admissions_with_target,
    important_labs=None
)
# This takes about 60min for 100,000 patients

In [None]:
# Save in case we need to restart because it takes a long time to process
lab_features = pd.read_parquet('lab_features.parquet')
lab_features.head()

Unnamed: 0,hadm_id,aniongap_mean,aniongap_min,aniongap_max,aniongap_count,bicarbonate_mean,bicarbonate_min,bicarbonate_max,bicarbonate_count,calcium_mean,...,hemoglobin_max,hemoglobin_count,platelet_mean,platelet_min,platelet_max,platelet_count,wbc_mean,wbc_min,wbc_max,wbc_count
0,22595853,9.0,9.0,9.0,1,28.0,28.0,28.0,1,7.8,...,12.7,1,71.0,71.0,71.0,1,4.2,4.2,4.2,1
1,22841357,14.0,14.0,14.0,1,25.0,25.0,25.0,1,7.8,...,12.4,1,137.0,137.0,137.0,1,6.6,6.6,6.6,1
2,29079034,11.333333,9.0,14.0,3,24.0,21.0,27.0,3,9.033333,...,11.9,2,94.5,94.0,95.0,2,4.45,4.1,4.8,2
3,25742920,9.333333,6.0,11.0,3,25.0,24.0,26.0,3,8.7,...,12.1,2,120.0,107.0,133.0,2,6.55,5.6,7.5,2
4,22927623,20.0,20.0,20.0,1,22.0,22.0,22.0,1,,...,14.9,1,248.0,248.0,248.0,1,5.0,5.0,5.0,1


In [None]:
def combine_features(demographics_df, diagnosis_df, lab_df):
    """Combine all processed features into final dataset"""
    
    # Start with demographics (includes readmission target)
    final_df = demographics_df.copy()
    
    # Add diagnosis features
    final_df = final_df.merge(diagnosis_df, on='hadm_id', how='left')
        
    # Fill missing diagnosis features with 0 (patient didn't have that diagnosis)
    diag_cols = [col for col in diagnosis_df.columns if col.startswith('diag_')]
    final_df[diag_cols] = final_df[diag_cols].fillna(0)
    
    # Add lab features
    final_df = final_df.merge(lab_df, on='hadm_id', how='left')
    
    # Select final feature columns
    feature_columns = [
        'hadm_id', 'subject_id', 'readmitted',  # IDs and target
        'age_at_admission', 'gender_male', 'los_days',  # Demographics
        'admission_type', 'insurance'  # Admission details
    ]
    
    # Add diagnosis features
    feature_columns.extend([col for col in final_df.columns if col.startswith('diag_')])
    
    # Add lab features
    feature_columns.extend([col for col in final_df.columns if col.endswith('_mean')])
    
    # Filter to available columns
    available_columns = [col for col in feature_columns if col in final_df.columns]
    final_df = final_df[available_columns]
    
    print(f"Final dataset shape: {final_df.shape}")
    print(f"Total features: {len(available_columns) - 3}")  # Minus IDs and target
    
    return final_df


In [26]:
final_dataset = combine_features(demographics, diagnosis_features, lab_features)

Combining all features...
  ✓ Adding diagnosis features...
  ✓ Adding lab features...
  ✓ Final dataset shape: (244240, 53)
  ✓ Total features: 50


In [28]:
final_dataset.head()

Unnamed: 0,hadm_id,subject_id,readmitted,age_at_admission,gender_male,los_days,admission_type,insurance,diag_I10,diag_E785,...,glucose_mean,magnesium_mean,phosphate_mean,potassium_mean,sodium_mean,urea_mean,hematocrit_mean,hemoglobin_mean,platelet_mean,wbc_mean
0,22595853,10000032,0,52,0,0,URGENT,Medicaid,0.0,0.0,...,99.0,1.7,3.6,4.5,137.0,25.0,37.6,12.7,71.0,4.2
1,22841357,10000032,1,52,0,1,EW EMER.,Medicaid,0.0,0.0,...,71.0,1.9,3.3,5.2,126.0,29.0,35.5,12.4,137.0,6.6
2,29079034,10000032,1,52,0,2,EW EMER.,Medicaid,0.0,0.0,...,114.333333,2.133333,2.333333,4.966667,130.666667,32.0,33.45,11.55,94.5,4.45
3,25742920,10000032,0,52,0,1,EW EMER.,Medicaid,0.0,0.0,...,94.666667,2.1,3.633333,5.666667,126.0,31.333333,34.05,11.85,120.0,6.55
4,22927623,10000117,0,48,0,0,EU OBSERVATION,Medicaid,0.0,0.0,...,85.0,,,3.6,142.0,8.0,44.4,14.9,248.0,5.0


In [32]:
# Final dataset statistics
print(f"Final dataset shape: {final_dataset.shape}")
print(f"Readmission rate: {final_dataset['readmitted'].mean():.2%}")
    
# Show feature categories
feature_types = {
        'Demographics': ['age_at_admission', 'gender_male', 'los_days'],
        'Diagnosis': [col for col in final_dataset.columns if col.startswith('diag_')],
        'Lab Values': [col for col in final_dataset.columns if col.endswith('_mean')]
    }
print("\nFeatures:\n")
for category, features in feature_types.items():
        available_features = [f for f in features if f in final_dataset.columns]
        print(f"{category}: {len(available_features)} features")
    
# Check for missing values
missing_pct = (final_dataset.isnull().sum() / len(final_dataset)) * 100
high_missing = missing_pct[missing_pct > 20]
    
print("\nMissing values analysis:\n")
if len(high_missing) > 0:
        print(f"Features with >20% missing values:")
        for feature, pct in high_missing.items():
            print(f"{feature}: {pct:.1f}% missing")
    
    # Save processed dataset
output_file = 'mimic_readmission_features.csv'
final_dataset.to_csv(output_file, index=False)
print(f"\nSaved processed dataset to: {output_file}")


Final dataset shape: (244240, 53)
Readmission rate: 20.38%

Features:

Demographics: 3 features
Diagnosis: 30 features
Lab Values: 15 features

Missing values analysis:

Features with >20% missing values:
aniongap_mean: 25.4% missing
bicarbonate_mean: 25.4% missing
calcium_mean: 31.7% missing
chloride_mean: 24.9% missing
creatinine_mean: 23.9% missing
glucose_mean: 25.5% missing
magnesium_mean: 30.5% missing
phosphate_mean: 32.2% missing
potassium_mean: 24.5% missing
sodium_mean: 24.8% missing
urea_mean: 24.5% missing
hematocrit_mean: 21.2% missing
hemoglobin_mean: 22.7% missing
platelet_mean: 22.4% missing
wbc_mean: 22.8% missing

Saved processed dataset to: mimic_readmission_features.csv
