# Final Project 
## Complete Machine Learning Pipeline for MIMIC Classification

### Mariajose Argote


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [4]:
# Generate a random seed for reproducibility
# np.random.seed() has a range of [0, 2**32 - 1] for the seed value
print(np.random.randint(0, 2**30))

SEED = 244459055

np.random.seed(SEED)
#540921260


540921260


## Import MIMIC Data

In [9]:
# Import data

train = pd.read_csv('MIMIC III dataset HEF/mimic_train_HEF.csv')
test= pd.read_csv('MIMIC III dataset HEF/mimic_test_HEF.csv')


# Display first few rows of the dataset

print(train.info())

train.describe()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20885 entries, 0 to 20884
Data columns (total 44 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   HOSPITAL_EXPIRE_FLAG  20885 non-null  int64  
 1   subject_id            20885 non-null  int64  
 2   hadm_id               20885 non-null  int64  
 3   icustay_id            20885 non-null  int64  
 4   HeartRate_Min         18698 non-null  float64
 5   HeartRate_Max         18698 non-null  float64
 6   HeartRate_Mean        18698 non-null  float64
 7   SysBP_Min             18677 non-null  float64
 8   SysBP_Max             18677 non-null  float64
 9   SysBP_Mean            18677 non-null  float64
 10  DiasBP_Min            18676 non-null  float64
 11  DiasBP_Max            18676 non-null  float64
 12  DiasBP_Mean           18676 non-null  float64
 13  MeanBP_Min            18699 non-null  float64
 14  MeanBP_Max            18699 non-null  float64
 15  MeanBP_Mean        

Unnamed: 0,HOSPITAL_EXPIRE_FLAG,subject_id,hadm_id,icustay_id,HeartRate_Min,HeartRate_Max,HeartRate_Mean,SysBP_Min,SysBP_Max,SysBP_Mean,...,TempC_Max,TempC_Mean,SpO2_Min,SpO2_Max,SpO2_Mean,Glucose_Min,Glucose_Max,Glucose_Mean,Diff,LOS
count,20885.0,20885.0,20885.0,20885.0,18698.0,18698.0,18698.0,18677.0,18677.0,18677.0,...,18388.0,18388.0,18682.0,18682.0,18682.0,20632.0,20632.0,20632.0,20885.0,20885.0
mean,0.112282,58950.496098,150082.402298,250202.495523,69.705904,105.239801,85.18025,91.110564,150.72592,119.145423,...,37.428781,36.751717,91.007494,99.555883,96.866685,106.781975,182.129604,138.856428,-51617.06983,3.701046
std,0.31572,25299.439535,28898.479845,28909.806302,14.86984,20.922613,15.318208,17.532534,23.833793,16.701503,...,0.799897,0.603476,7.39939,1.159792,2.333108,35.178811,92.665603,44.933145,10686.395846,5.175721
min,0.0,23.0,100001.0,200001.0,2.0,39.0,34.714286,5.0,46.0,46.0,...,30.8,30.666667,1.0,57.0,47.666667,2.0,42.0,42.0,-72740.27444,0.0566
25%,0.0,41132.0,125157.0,225153.0,60.0,90.0,74.272727,81.0,134.0,107.1,...,36.944444,36.388889,90.0,100.0,95.8125,86.0,126.0,110.75,-60864.45411,1.1654
50%,0.0,60441.0,150152.0,250452.0,69.0,103.0,84.131183,90.0,148.0,116.901961,...,37.333333,36.731481,92.0,100.0,97.142857,102.0,161.0,128.666667,-51561.70346,2.0208
75%,0.0,80286.0,175017.0,275303.0,79.0,118.0,95.185588,101.0,164.0,129.465116,...,37.888889,37.092593,94.0,100.0,98.346154,121.0,206.0,154.578571,-42327.56003,3.9158
max,1.0,99999.0,199999.0,299998.0,141.0,280.0,163.875,181.0,323.0,202.172414,...,42.0,40.238333,100.0,100.0,100.0,563.0,2440.0,771.9,-32157.49458,101.739


In [16]:
print(test.info())
test.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5221 entries, 0 to 5220
Data columns (total 39 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   subject_id      5221 non-null   int64  
 1   hadm_id         5221 non-null   int64  
 2   icustay_id      5221 non-null   int64  
 3   HeartRate_Min   4676 non-null   float64
 4   HeartRate_Max   4676 non-null   float64
 5   HeartRate_Mean  4676 non-null   float64
 6   SysBP_Min       4670 non-null   float64
 7   SysBP_Max       4670 non-null   float64
 8   SysBP_Mean      4670 non-null   float64
 9   DiasBP_Min      4669 non-null   float64
 10  DiasBP_Max      4669 non-null   float64
 11  DiasBP_Mean     4669 non-null   float64
 12  MeanBP_Min      4674 non-null   float64
 13  MeanBP_Max      4674 non-null   float64
 14  MeanBP_Mean     4674 non-null   float64
 15  RespRate_Min    4675 non-null   float64
 16  RespRate_Max    4675 non-null   float64
 17  RespRate_Mean   4675 non-null   f

Unnamed: 0,subject_id,hadm_id,icustay_id,HeartRate_Min,HeartRate_Max,HeartRate_Mean,SysBP_Min,SysBP_Max,SysBP_Mean,DiasBP_Min,...,TempC_Min,TempC_Max,TempC_Mean,SpO2_Min,SpO2_Max,SpO2_Mean,Glucose_Min,Glucose_Max,Glucose_Mean,Diff
count,5221.0,5221.0,5221.0,4676.0,4676.0,4676.0,4670.0,4670.0,4670.0,4669.0,...,4583.0,4583.0,4583.0,4670.0,4670.0,4670.0,5163.0,5163.0,5163.0,5221.0
mean,58870.629573,150248.186746,249925.293239,69.942686,105.27994,85.221128,90.862392,150.547966,118.763143,44.142643,...,36.041879,37.44147,36.755418,90.82848,99.553319,96.857538,107.823494,376.578346,163.521697,-51606.334303
std,25320.956194,28679.538155,28763.030681,15.013524,21.247956,15.484126,17.52914,23.57545,16.490767,11.695481,...,0.762967,0.797153,0.595941,7.660437,1.516541,2.406957,34.312518,13915.51071,1739.829727,10666.881712
min,107.0,100033.0,200011.0,1.0,42.0,36.0,10.0,62.0,51.75,6.0,...,22.777778,32.555556,32.222222,1.0,42.0,42.0,15.0,32.0,32.0,-72639.6203
25%,41055.0,125246.0,225118.0,60.0,90.0,74.035131,81.0,134.0,106.752976,37.0,...,35.666667,36.944444,36.409722,90.0,100.0,95.840227,87.0,126.0,111.5,-60840.73009
50%,60057.0,150492.0,249759.0,69.0,104.0,83.957428,90.0,148.0,116.807407,44.0,...,36.111111,37.333333,36.738095,92.0,100.0,97.134848,102.0,160.0,129.0,-51612.81206
75%,80313.0,174983.0,274576.0,79.0,118.0,95.477143,101.0,164.0,128.862844,51.0,...,36.5,37.888889,37.091705,94.0,100.0,98.333333,122.0,205.0,155.060976,-42552.06572
max,99992.0,199967.0,299979.0,143.0,220.0,155.571429,162.0,290.0,195.25,97.0,...,39.388889,41.111111,39.527778,100.0,100.0,100.0,374.0,999999.0,125110.125,-32176.62848


In [22]:
# Check train/test column consistency
target = "HOSPITAL_EXPIRE_FLAG"
leaky_columns = ['LOS', 'DOD', 'DISCHTIME', 'DEATHTIME'] 

# Columns to use as features in train
feature_columns = [c for c in train.columns if c != target and c not in leaky_columns]

# 1) Check that test has all feature columns
missing_in_test = set(feature_columns) - set(test.columns)
extra_in_test   = set(test.columns) - set(feature_columns)
print("Missing in test:", missing_in_test)
print("Extra in test  :", extra_in_test)

# 2) Reorder test columns to match train's feature order
test = test[feature_columns]


#checking type consistency
shared = set(train.columns) & set(test.columns)
dtype_diff = {
    col: (train[col].dtype, test[col].dtype)
    for col in shared
    if train[col].dtype != test[col].dtype
}
print("Columns with different dtypes:", dtype_diff)

Missing in test: set()
Extra in test  : set()
Columns with different dtypes: {}
