In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import warnings
warnings.filterwarnings("ignore")

## Reading the datasets

In [2]:
admissions = pd.read_csv("ADMISSIONS.csv", usecols=['SUBJECT_ID', 'HADM_ID', "ADMITTIME"])

In [3]:
chartevents = pd.read_csv("CHARTEVENTS.csv", usecols=["SUBJECT_ID", "HADM_ID", "ITEMID", "CHARTTIME", "VALUENUM", "VALUEUOM"])

In [4]:
icustays = pd.read_csv("ICUSTAYS.csv", usecols=["SUBJECT_ID", "HADM_ID", "ICUSTAY_ID", "LOS"])

In [5]:
labevents = pd.read_csv("LABEVENTS.csv", usecols=["SUBJECT_ID", "HADM_ID", "ITEMID", "CHARTTIME", "VALUENUM", "VALUEUOM"])

In [6]:
patients = pd.read_csv("PATIENTS.csv", usecols=["SUBJECT_ID", "GENDER", "DOB"])

## Preprocessing

### Admissions + Patiens

In [7]:
admissions_patients = pd.merge(admissions, patients, on='SUBJECT_ID', how='inner')

In [8]:
dummy_cols = pd.get_dummies(admissions_patients['GENDER'])
admissions_patients = pd.concat([admissions_patients, dummy_cols], axis=1)
admissions_patients = admissions_patients.drop('GENDER', axis=1)

In [9]:
admissions_patients = admissions_patients.rename(columns={'F': 'FEMALE', 'M':'MALE'})

In [10]:
admissions_patients["ADMITTIME"] = pd.to_datetime(admissions_patients["ADMITTIME"])
admissions_patients["DOB"] = pd.to_datetime(admissions_patients["DOB"])

In [11]:
admissions_patients["AGE"] = admissions_patients["ADMITTIME"].dt.year-admissions_patients["DOB"].dt.year

In [12]:
admissions_patients

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DOB,FEMALE,MALE,AGE
0,22,165315,2196-04-09 12:26:00,2131-05-07,1,0,65
1,23,152223,2153-09-03 07:15:00,2082-07-17,0,1,71
2,23,124321,2157-10-18 19:34:00,2082-07-17,0,1,75
3,24,161859,2139-06-06 16:14:00,2100-05-31,0,1,39
4,25,129635,2160-11-02 02:06:00,2101-11-21,0,1,59
...,...,...,...,...,...,...,...
58971,98794,190603,2127-11-07 11:00:00,2049-07-29,0,1,78
58972,98797,105447,2132-12-24 20:06:00,2044-12-27,0,1,88
58973,98800,191113,2131-03-30 21:13:00,2111-11-05,1,0,20
58974,98802,101071,2151-03-05 20:00:00,2067-09-21,1,0,84


In [13]:
admissions_patients = admissions_patients[admissions_patients['AGE'] >= 18]

In [14]:
np.random.seed(123)

distinct_subjects = admissions_patients['SUBJECT_ID'].unique()
half_count = len(distinct_subjects) // 2
random_subjects = np.random.choice(distinct_subjects, size=half_count, replace=False)
admissions_patients = admissions_patients[~admissions_patients['SUBJECT_ID'].isin(random_subjects)]

In [15]:
admissions_patients.SUBJECT_ID.nunique()

19289

### Chartevents

In [16]:
dict_of_items_ce = {"SBP" : [220050], 
                    "DBP" : [220051, 220180, 225310, 8368, 8441, 8555],
                    "BOS" : [220227, 220277, 834, 646],
                    "Temperature_Ce" : [223762, 676],
                    "Temperature_Fe":[223761,678],
                    "HR" : [220045, 211],
                    "RR" : [220210, 224422, 224689, 224690, 614, 615, 618, 651],
                    "CO2" : [220235, 778],
                    "WBC" : [51301],
                    "PH" : [50820]
                    }

In [17]:
chart_lab = pd.concat([chartevents,labevents],axis=0).reset_index(drop=True)

In [18]:
reversed_dict = {item: key for key, value in dict_of_items_ce.items() for item in value}
chart_lab["ITEMID"] = chart_lab["ITEMID"].map(reversed_dict)
chart_lab_temp = chart_lab[chart_lab["ITEMID"].isin(dict_of_items_ce.keys())].pivot(columns="ITEMID", values="VALUENUM")

In [19]:
#reversed_dict = {item: key for key, value in dict_of_items_ce.items() for item in value}
#chart_lab = chart_lab.replace(to_replace=reversed_dict)
#chart_lab_temp = chart_lab[chart_lab["ITEMID"].isin(dict_of_items_ce.keys())].pivot(columns="ITEMID",values="VALUENUM")

In [20]:
bitisik=pd.merge(chart_lab,chart_lab_temp,left_index=True,right_index=True,how="inner")
bitisik=pd.merge(bitisik,admissions_patients,on=["HADM_ID","SUBJECT_ID"])

In [21]:
bitisik = bitisik.groupby(["SUBJECT_ID", "HADM_ID", "CHARTTIME"], as_index=False).first()

In [22]:
bitisik["Temperature_Fe_Ce"]=(bitisik["Temperature_Fe"]-32)/1.8
bitisik["Temperature_Ce"]=bitisik[["Temperature_Ce","Temperature_Fe_Ce"]].sum(axis=1,min_count=1)
bitisik.loc[bitisik.Temperature_Ce > 90, 'Temperature_Ce'] = np.nan
bitisik.loc[bitisik.Temperature_Ce < 20, 'Temperature_Ce'] = np.nan
bitisik=bitisik.drop(["Temperature_Fe","Temperature_Fe_Ce"],axis=1)

In [23]:
icustays = icustays[icustays['LOS'] >= 0.5]

In [24]:
bitisik=pd.merge(bitisik,icustays,on=["HADM_ID","SUBJECT_ID"])

In [25]:
bitisik["CHARTTIME"] = pd.to_datetime(bitisik["CHARTTIME"])

In [26]:
bitisik["period"]=np.ceil(((bitisik["CHARTTIME"]-bitisik["ADMITTIME"]).dt.total_seconds()/3600)/1)
bitisik["period"]=bitisik["period"].replace({"-0.0":"1"}).astype(int)

In [27]:
bitisik.drop(["ITEMID", "VALUEUOM"], axis=1, inplace=True)

In [28]:
bitisik=bitisik[bitisik.period>=0]

In [29]:
avg=bitisik.groupby(["SUBJECT_ID","HADM_ID","ICUSTAY_ID","period"],as_index=False)\
        ['SBP', 'DBP', 'BOS', 'Temperature_Ce', 'HR', 'RR', 'CO2', 'WBC', 'PH',"AGE"].agg(['min', 'max'])\
            .groupby(["SUBJECT_ID","HADM_ID","ICUSTAY_ID"]).apply(lambda x: x.ffill().fillna(x.mean()))\
                .groupby(["SUBJECT_ID"]).apply(lambda x: x.ffill().fillna(x.mean()))\
                    .fillna(bitisik.mean()).drop(('AGE', 'max'),axis=1)

In [30]:
avg

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,SBP,SBP,DBP,DBP,BOS,BOS,Temperature_Ce,Temperature_Ce,HR,HR,RR,RR,CO2,CO2,WBC,WBC,PH,PH,AGE
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,min,max,min,max,min,max,min,max,min,max,min,max,min,max,min,max,min,max,min
SUBJECT_ID,HADM_ID,ICUSTAY_ID,period,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
11,194540.0,229441,0,120.931223,120.931223,64.043538,64.043538,96.655172,96.655172,37.222222,37.222222,81.0,90.0,16.0,19.0,42.149575,42.149575,9.933333,9.933333,7.52,7.52,50
11,194540.0,229441,1,120.931223,120.931223,64.043538,64.043538,96.000000,96.000000,36.888890,36.888890,86.0,86.0,17.0,17.0,42.149575,42.149575,9.300000,9.300000,7.52,7.52,50
11,194540.0,229441,2,120.931223,120.931223,64.043538,64.043538,97.000000,97.000000,36.888890,36.888890,90.0,90.0,20.0,20.0,42.149575,42.149575,9.300000,9.300000,7.52,7.52,50
11,194540.0,229441,3,120.931223,120.931223,64.043538,64.043538,96.000000,96.000000,36.888890,36.888890,95.0,95.0,18.0,18.0,42.149575,42.149575,9.300000,9.300000,7.52,7.52,50
11,194540.0,229441,4,120.931223,120.931223,64.043538,64.043538,96.000000,96.000000,36.888890,36.888890,97.0,97.0,20.0,20.0,42.149575,42.149575,9.300000,9.300000,7.52,7.52,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99999,113369.0,246512,51,120.931223,120.931223,66.000000,66.000000,100.000000,100.000000,36.666667,36.666667,83.0,83.0,20.0,20.0,42.149575,42.149575,9.300000,9.300000,7.39,7.39,63
99999,113369.0,246512,52,120.931223,120.931223,64.000000,64.000000,99.000000,99.000000,36.666667,36.666667,78.0,78.0,16.0,16.0,42.149575,42.149575,9.300000,9.300000,7.39,7.39,63
99999,113369.0,246512,53,120.931223,120.931223,65.000000,65.000000,99.000000,99.000000,37.277778,37.277778,82.0,82.0,17.0,17.0,42.149575,42.149575,9.300000,9.300000,7.39,7.39,63
99999,113369.0,246512,54,120.931223,120.931223,61.000000,61.000000,100.000000,100.000000,37.277778,37.277778,86.0,86.0,15.0,15.0,42.149575,42.149575,9.300000,9.300000,7.39,7.39,63


In [31]:
yislem=avg.copy()

In [32]:
yislem["Temperature_cond"]=np.where(yislem[("Temperature_Ce","min")]<36,1,np.where(yislem[("Temperature_Ce","max")]>38,1,0))

In [33]:
yislem["HR_cond"]=np.where(yislem[("HR","max")]>90,1,0)

In [34]:
yislem["RR_cond"]=np.where(yislem[("RR","max")]>20,1,0)

In [35]:
yislem["WBC_cond"]=np.where(yislem[("WBC", "max")]>12,1,np.where(yislem[("WBC","min")]<4,1,0))

In [36]:
yislem["sepsis_sum"]=yislem[["Temperature_cond","WBC_cond","HR_cond","RR_cond"]].sum(axis=1)

In [37]:
yislem["Sepsis_label"]=np.where(yislem["sepsis_sum"]>1,1,0)

In [38]:
sepsis_basladi_mi=np.where(yislem.groupby(["SUBJECT_ID","HADM_ID","ICUSTAY_ID"])["Sepsis_label"].rolling(5).sum().shift(-4).ffill()>=5,1,0)

In [39]:
yislem["sepsis_basladi_mi"]=sepsis_basladi_mi

In [40]:
rolling_window = 5
# Function to apply rolling rows as columns to specific columns
def rolling_rows_as_columns(group):
    columns_to_roll = avg.columns.to_list()  # Specify the columns to apply the rolling rows transformation
    for i in range(1, rolling_window + 1):
        for col in columns_to_roll:
            group[f'{col}_Rolling_{i}'] = group[col].shift(i)
    return group
# Apply the function after grouping the data
df_grouped = avg.groupby(["SUBJECT_ID","HADM_ID","ICUSTAY_ID"]).apply(rolling_rows_as_columns)

In [41]:
df_grouped.dropna(inplace=True)

In [63]:
ysutun=pd.concat([yislem.reset_index().groupby(["SUBJECT_ID","HADM_ID","ICUSTAY_ID"])["sepsis_basladi_mi"].shift(-11),yislem.index.to_frame().reset_index(drop=True)],axis=1)

In [64]:
ysutun=ysutun.set_index(["SUBJECT_ID","HADM_ID","ICUSTAY_ID","period"])

In [65]:
sonhal=pd.merge(df_grouped,ysutun,left_index=True,right_index=True,how="left").dropna()

In [66]:
y=sonhal.sepsis_basladi_mi

In [67]:
sonhal.to_pickle('sonhal_sepsis12hr.pkl')

liste = [19, 20,21,22,23,24,25,45,46,47,48,49,50,51,71,72,73,74,75,76,77,97,98,99,100,101,102,103,123,124,125,126,127,128,129,149,150,151,152,153,154,155]
sonhal.drop(sonhal.columns[liste], axis=1, inplace=True)

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 157)
sonhal

In [68]:
from sklearn.model_selection import train_test_split
# Veri split
X_train, X_test, y_train, y_test = train_test_split(sonhal.iloc[:,0:114], y, test_size=0.2, random_state=42)

In [69]:
x0_train=X_train.iloc[:,0:19]
x1_train=X_train.iloc[:,19:38]
x2_train=X_train.iloc[:,38:57]
x3_train=X_train.iloc[:,57:76]
x4_train=X_train.iloc[:,76:95]
x5_train=X_train.iloc[:,95:114]

x0_test=X_test.iloc[:,0:19]
x1_test=X_test.iloc[:,19:38]
x2_test=X_test.iloc[:,38:57]
x3_test=X_test.iloc[:,57:76]
x4_test=X_test.iloc[:,76:95]
x5_test=X_test.iloc[:,95:114]

X_train=np.array([x0_train,x1_train,x2_train,x3_train,x4_train,x5_train])
X_test=np.array([x0_test,x1_test,x2_test,x3_test,x4_test,x5_test])

In [70]:
X_train = np.transpose(X_train, (1, 0, 2))
X_test = np.transpose(X_test, (1, 0, 2))

In [71]:
np.save('X_train12hr.npy', X_train)
np.save('X_test12hr.npy', X_test)
np.save('y_train12hr.npy', y_train)
np.save('y_test12hr.npy', y_test)

np.save('X_train.npy', X_train)

np.save('X_test.npy', X_test)

np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)

In [51]:
# a = np.load('X_train.npy')

sonhal.to_pickle('sonhal.pkl')

loaded_dataframe = pd.read_pickle('sonhal.pkl')

np.save('X_train_sepsis.npy', X_train)
np.save('X_test_sepsis.npy', X_test)
np.save('y_train_sepsis.npy', y_train)
np.save('y_test_sepsis.npy', y_test)
sonhal.to_pickle('sonhal_sepsis.pkl')