In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
admissions = pd.read_csv('mimic-iii-clinical-database/ADMISSIONS.csv')
patients = pd.read_csv('mimic-iii-clinical-database/PATIENTS.csv')
icu_stays = pd.read_csv('mimic-iii-clinical-database/ICUSTAYS.csv')

In [3]:
print(admissions.head())
print(patients.head())
print(icu_stays.head())

   row_id  subject_id  hadm_id            admittime            dischtime  \
0   12258       10006   142345  2164-10-23 21:09:00  2164-11-01 17:15:00   
1   12263       10011   105331  2126-08-14 22:32:00  2126-08-28 18:59:00   
2   12265       10013   165520  2125-10-04 23:36:00  2125-10-07 15:13:00   
3   12269       10017   199207  2149-05-26 17:19:00  2149-06-03 18:42:00   
4   12270       10019   177759  2163-05-14 20:43:00  2163-05-15 12:00:00   

             deathtime admission_type         admission_location  \
0                  NaN      EMERGENCY       EMERGENCY ROOM ADMIT   
1  2126-08-28 18:59:00      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   
2  2125-10-07 15:13:00      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   
3                  NaN      EMERGENCY       EMERGENCY ROOM ADMIT   
4  2163-05-15 12:00:00      EMERGENCY  TRANSFER FROM HOSP/EXTRAM   

  discharge_location insurance language  religion marital_status  \
0   HOME HEALTH CARE  Medicare      NaN  CATHOLIC      SEPARATED  

In [4]:
   # Handle missing data in the DataFrame.
admissions = admissions.dropna(axis = 0, how ='any')  

In [5]:
print('Admissions columns:', admissions.columns)
print('Patients columns:', patients.columns)

Admissions columns: Index(['row_id', 'subject_id', 'hadm_id', 'admittime', 'dischtime',
       'deathtime', 'admission_type', 'admission_location',
       'discharge_location', 'insurance', 'language', 'religion',
       'marital_status', 'ethnicity', 'edregtime', 'edouttime', 'diagnosis',
       'hospital_expire_flag', 'has_chartevents_data'],
      dtype='object')
Patients columns: Index(['row_id', 'subject_id', 'gender', 'dob', 'dod', 'dod_hosp', 'dod_ssn',
       'expire_flag'],
      dtype='object')


In [6]:
reference_date = pd.Timestamp.now()
offset = reference_date.year - 1800  # Number of years to subtract to normalize the dates

# Assuming you've loaded your data into a dataframe named `patient_adm`
patient_adm = patients.merge(admissions, on='subject_id')
patient_adm['dob'] = pd.to_datetime(patient_adm['dob'], errors='coerce')

# Apply the offset to normalize the years
patient_adm['normalized_dob'] = patient_adm['dob'].apply(lambda x: x - pd.DateOffset(years=offset))


# Calculate age at the time of admission
patient_adm['age'] = patient_adm['admittime'].dt.year - patient_adm['dob'].dt.year

# Label target variable for LOS > 10 days or not
patient_adm['los_10_days_or_less'] = np.where(patient_adm['los'] <= 10, 1, 0)

OutOfBoundsDatetime: Cannot cast 1654-05-14 00:00:00 to unit='ns' without overflow.

In [None]:
# Normalize numerical features
scaler = StandardScaler()
patient_adm['AGE'] = scaler.fit_transform(patient_adm[['AGE']])

# Encode categorical features
le = LabelEncoder()
patient_adm['GENDER'] = le.fit_transform(patient_adm['GENDER'])

# Select features for prediction and the target variable
X = patient_adm[['AGE', 'GENDER']]  # Example features
y = patient_adm['LOS_10DAYS_OR_LESS']  # Target variable

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)