In [1]:
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): absolute, machine-specific location; edit this when running
# the notebook on another machine.
file_path = '/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/chartevents_demographics.csv'

# Load the CSV into a DataFrame
demographics = pd.read_csv(file_path)

# Confirm the load. Ending the cell with a bare expression lets the notebook
# render the frame with its rich table display; print() would only emit the
# plain-text repr.
demographics.head()

   subject_id   stay_id            charttime  itemid  value  valuenum
0    10002428  38875437  2156-04-12 16:24:00  226512   43.0      43.0
1    10004235  34100191  2196-02-24 14:39:00  226512  127.0     127.0
2    10004235  34100191  2196-02-24 14:39:00  226730  183.0     183.0
3    10004401  39699336  2144-06-05 19:46:00  226512   86.1      86.1
4    10004401  39699336  2144-06-05 19:46:00  226730  170.0     170.0


In [3]:
# Keep a single row per (subject_id, stay_id, itemid) combination. When a
# combination repeats, the row that comes first in the current row order is
# the one retained.
dedup_keys = ['subject_id', 'stay_id', 'itemid']
demographics = demographics.drop_duplicates(subset=dedup_keys, keep='first')
demographics.head()

Unnamed: 0,subject_id,stay_id,charttime,itemid,value,valuenum
0,10002428,38875437,2156-04-12 16:24:00,226512,43.0,43.0
1,10004235,34100191,2196-02-24 14:39:00,226512,127.0,127.0
2,10004235,34100191,2196-02-24 14:39:00,226730,183.0,183.0
3,10004401,39699336,2144-06-05 19:46:00,226512,86.1,86.1
4,10004401,39699336,2144-06-05 19:46:00,226730,170.0,170.0


In [4]:
# itemid -> human-readable measurement name
label_map = {226512: 'Weight', 226730: 'Height', 226228: 'Gender'}

# Attach the name as a 'label' column; an itemid missing from label_map
# ends up with NaN in that column.
demographics = demographics.assign(label=demographics['itemid'].map(label_map))

In [5]:
# Outlier check: smallest, largest and median valuenum for every itemid

valuenum_by_item = demographics.groupby('itemid')['valuenum']

summary_stats = (
    valuenum_by_item
    .agg(['min', 'max', 'median'])   # one column per statistic
    .add_suffix('_value')            # -> min_value, max_value, median_value
    .reset_index()                   # itemid back as an ordinary column
)

summary_stats

Unnamed: 0,itemid,min_value,max_value,median_value
0,226512,1.0,710.0,80.0
1,226730,0.0,445.0,170.0


In [6]:
# Accepted (low, high) range of valuenum for each itemid; both ends inclusive.
valid_ranges = {
    226512: (30, 200),
    226730: (70, 220)
}


def filter_to_valid_ranges(frame, ranges):
    """Keep the rows of ``frame`` whose ``valuenum`` is inside the range for its ``itemid``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``itemid`` and ``valuenum``.
    ranges : dict
        Maps an itemid to a ``(low, high)`` pair; both bounds are inclusive.

    Returns
    -------
    pandas.DataFrame
        A copy of the retained rows with their original index labels. A row
        is dropped when its itemid has no entry in ``ranges``, when its
        ``valuenum`` is missing, or when the value falls outside the bounds.
    """
    # Per-row bounds looked up in one vectorised pass instead of calling a
    # Python function for every row. An itemid without a range maps to NaN,
    # and every comparison against NaN is False, so that row is excluded.
    lower = frame['itemid'].map({item: bounds[0] for item, bounds in ranges.items()})
    upper = frame['itemid'].map({item: bounds[1] for item, bounds in ranges.items()})

    # between() is inclusive on both ends by default: lower <= valuenum <= upper
    within_range = frame['valuenum'].between(lower, upper)
    return frame[within_range].copy()


# Apply physiological range filtering
demographics = filter_to_valid_ranges(demographics, valid_ranges)

In [7]:
# Normalise valuenum with MinMaxScaler, fitted separately for every itemid

from sklearn.preprocessing import MinMaxScaler


def _with_normalized_valuenum(rows):
    """Return a copy of ``rows`` with a ``valuenum_normalized`` column added.

    A fresh MinMaxScaler is fitted on the ``valuenum`` column of ``rows``
    alone, so the scaling depends only on the values passed in.
    """
    scaled = rows.copy()  # work on a copy so the groupby slice is not modified
    scaled['valuenum_normalized'] = MinMaxScaler().fit_transform(scaled[['valuenum']])
    return scaled


# One normalised frame per itemid, in the order groupby yields the groups
normalized_frames = [
    _with_normalized_valuenum(item_rows)
    for _, item_rows in demographics.groupby('itemid')
]

# Stack the groups back into a single frame with a fresh 0..n-1 index
demographics = pd.concat(normalized_frames, ignore_index=True)

demographics.head()

Unnamed: 0,subject_id,stay_id,charttime,itemid,value,valuenum,label,valuenum_normalized
0,10002428,38875437,2156-04-12 16:24:00,226512,43.0,43.0,Weight,0.064967
1,10004235,34100191,2196-02-24 14:39:00,226512,127.0,127.0,Weight,0.574985
2,10004401,39699336,2144-06-05 19:46:00,226512,86.1,86.1,Weight,0.326655
3,10004733,39635619,2174-12-04 11:28:00,226512,112.5,112.5,Weight,0.486946
4,10005817,31316840,2135-01-03 21:55:00,226512,88.8,88.8,Weight,0.343048


In [8]:
# Restrict to the rows labelled Height or Weight
is_height_or_weight = demographics['label'].isin(['Height', 'Weight'])
height_weight_rows = demographics.loc[is_height_or_weight].copy()

# Reshape to one row per (subject_id, stay_id) with a Height and a Weight
# column holding the normalised values. If a key/label pair occurs more than
# once, aggfunc='first' takes the first value.
pivot_options = dict(
    index=['subject_id', 'stay_id'],
    columns='label',
    values='valuenum_normalized',
    aggfunc='first',
)
demographics1 = height_weight_rows.pivot_table(**pivot_options).reset_index()

# Clear the leftover 'label' name on the column index
demographics1.columns.name = None

In [9]:
# Preview the first five rows of the pivoted table
demographics1.head(n=5)

Unnamed: 0,subject_id,stay_id,Height,Weight
0,10002428,38875437,,0.064967
1,10004235,34100191,0.849624,0.574985
2,10004401,39699336,0.75188,0.326655
3,10004733,39635619,0.827068,0.486946
4,10005817,31316840,0.789474,0.343048


In [10]:
import os

# Write demographics1 to disk; index=False keeps the row index out of the file.
cleaned_csv_path = "/Users/mmurali1/Documents/Imperial/MIMICIV_analysis/demographics_cleaned.csv"
demographics1.to_csv(cleaned_csv_path, index=False)