In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [3]:
IDs = []
for file in os.listdir('/Users/manny/Downloads/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/set_a'):
    if file.endswith('.txt'):
        IDs.append(file)
        

In [4]:
time_dfs = []
static_dfs = []

# use IDs list to make dataframes for each txt file
for i in IDs:
    # use read_csv to make dataframe
    df = pd.read_csv('{}'.format(i))
    
    # make one dataframe with just temporal values
    df['RecordID'] = df.at[0, 'Value']
    df1 = df[6:]
    
    # make another dataframe with just static values
    d = df[:6]
    values = d['Value'].values    
    df2 = pd.DataFrame([values], columns=['RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight'])
    
    # append lists with dataframes
    time_dfs.append(df1)
    static_dfs.append(df2)
    
# Concatenate all created dataframes into one
df_time = pd.concat(time_dfs).reset_index(drop=True)
df_static = pd.concat(static_dfs).reset_index(drop=True)

In [24]:
df_static.shape

(4000, 6)

In [25]:
df_time.shape

(1733980, 4)

In [7]:
# create function to replace missing/erroneous values in columns

def data_preprocessing(series, ok_range):
    col = series.copy()
    good_values = []
    indexes = []
    
    # append lists with good values and bad indexes
    for i in series:
        # if the metric is between an accepted range, use to calc mean
        # ex: 7'5" is max height, so if i is < 7'5" it is added to list
        if i >= ok_range[0] and i <= ok_range[1]:
            good_values.append(i)
            
        # if out of range, replace with -5    
        else:
            x = col[col == i].index
            indexes.append(x)
    
    # replace series values with -5
    for x in indexes:
        series[x] = -5
        
    # now replace bad values with mean for column
    mean = sum(good_values) / len(good_values)
    mean = round(mean, 1)
    new_series = series.replace(-5, mean)
    return new_series
            
            

In [28]:
# create function to replace missing/erroneous values in columns

def data_preprocessing2(series, ok_range):
    col = series.copy()
    good_values = []
    indexes = []
    
    # append lists with good values and bad indexes
    for i in series:
        # if the metric is between an accepted range, use to calc mean
        # ex: 7'5" is max height, so if i is < 7'5" it is added to list
        if i >= ok_range[0] and i <= ok_range[1]:
            good_values.append(i)
            
        # if out of range, replace with -5    
        else:
            x = col[col == i].index
            indexes.append(x)
    
    # replace series values with -5
    for x in indexes:
        series[x] = -5
        
    # now replace bad values with mean for column
    new_series = series.replace(-5, np.nan)
    return new_series
            

In [8]:
height_col = data_preprocessing(df_static['Height'], [140, 230])

In [9]:
weight_col = data_preprocessing(df_static['Weight'], [40, 301])

In [29]:
gender_col = data_preprocessing2(df_static['Gender'], [0, 1])

In [30]:
ICU_col = data_preprocessing2(df_static['ICUType'], [1, 4])

In [10]:
df_static['Height'] = height_col

In [11]:
df_static['Weight'] = weight_col

In [27]:
df_static['Gender'] = gender_col

In [32]:
df_static['ICUType'] = ICU_col

In [37]:
df_static = df_static.iloc[:, :6]

In [52]:
df_static.shape

(4000, 6)

In [39]:
df_static.to_csv('pred_mor_static.csv')

In [15]:
def time_to_hours(series):
    series = series.copy()
    lols = []
    
    for i in series:
        j = int(i[:2])
        k = int(i[-2:])
        l = j + (k / 60)
        lols.append(round(l, 2))    
    return lols
            
            

In [16]:
hours = pd.Series(time_to_hours(df_time['Time']))


In [18]:
df_time['Time'] = hours

In [40]:
df_time.head()

Unnamed: 0,Time,Parameter,Value,RecordID
0,1.33,GCS,15.0,132592.0
1,1.33,HR,112.0,132592.0
2,1.33,NIDiasABP,43.0,132592.0
3,1.33,NIMAP,68.67,132592.0
4,1.33,NISysABP,120.0,132592.0


In [53]:
df_time.Parameter.value_counts

<bound method IndexOpsMixin.value_counts of 0                GCS
1                 HR
2          NIDiasABP
3              NIMAP
4           NISysABP
             ...    
1733975        Urine
1733976      DiasABP
1733977           HR
1733978          MAP
1733979       SysABP
Name: Parameter, Length: 1733980, dtype: object>

In [20]:
df_time.to_csv('pred_mor_time.csv')

In [41]:
df1 = df_time[df_time['RecordID'] == 132592]

In [50]:
df1l.max()

603.0

In [48]:
print(df1[:50])

    Time   Parameter   Value  RecordID
0   1.33         GCS   15.00  132592.0
1   1.33          HR  112.00  132592.0
2   1.33   NIDiasABP   43.00  132592.0
3   1.33       NIMAP   68.67  132592.0
4   1.33    NISysABP  120.00  132592.0
5   1.33    RespRate   22.00  132592.0
6   1.33        Temp   36.60  132592.0
7   1.33      Weight   71.80  132592.0
8   2.33          HR  113.00  132592.0
9   2.33   NIDiasABP   53.00  132592.0
10  2.33       NIMAP   76.67  132592.0
11  2.33    NISysABP  124.00  132592.0
12  2.33    RespRate   21.00  132592.0
13  2.33       Urine  120.00  132592.0
14  2.33      Weight   71.80  132592.0
15  2.60         BUN   68.00  132592.0
16  2.60  Creatinine    2.30  132592.0
17  2.60     Glucose  603.00  132592.0
18  2.60        HCO3   11.00  132592.0
19  2.60         HCT   25.50  132592.0
20  2.60          Mg    2.80  132592.0
21  2.60   Platelets  287.00  132592.0
22  2.60           K    5.30  132592.0
23  2.60          Na  140.00  132592.0
24  2.60   TroponinT    0

# Preparing Test Set

In [None]:
test_IDs = []
for file in os.listdir('/Users/manny/Downloads/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/Outcomes-a.txt'):
    if file.endswith('.txt'):
        IDs.append(file)

In [54]:
df_test = pd.read_csv('/Users/manny/Downloads/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0/Outcomes-a.txt')

In [55]:
df_test

Unnamed: 0,RecordID,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,132539,6,1,5,-1,0
1,132540,16,8,8,-1,0
2,132541,21,11,19,-1,0
3,132543,7,1,9,575,0
4,132545,17,2,4,918,0
...,...,...,...,...,...,...
3995,142665,19,7,10,336,0
3996,142667,8,2,3,-1,0
3997,142670,8,5,11,-1,0
3998,142671,22,10,8,7,1


In [57]:
outcomes = df_test.drop(['SAPS-I', 'SOFA', 'Length_of_stay', 'Survival'], axis=1)

In [58]:
outcomes

Unnamed: 0,RecordID,In-hospital_death
0,132539,0
1,132540,0
2,132541,0
3,132543,0
4,132545,0
...,...,...
3995,142665,0
3996,142667,0
3997,142670,0
3998,142671,1


In [59]:
outcomes = outcomes.sort_values(by=['RecordID'])

In [60]:
outcomes.to_csv('outcomes_a.csv')