<h1>IS4242 Group Project</h1>

<b>Import necessary libraries</b>

In [1]:
import pandas as pd
import numpy as np
import os

'''
Ensure that you are in the root folder of all the fold folders and target files
read_text(fold_name):
    fold_name: this is the name of the fold you want to read ALL patient files of. It will be read into a 2 dimensional
    list. If you would like to retrieve just the first patient instead, you will need to change the line 
    "txt_all.extend(txt[1:])" to "txt_all.append(txt[1:])" and you will be to use "read_text(fold1.txt)[0]" to retrieve
    the relevant patient's data
read_ans(file_name):
    file_name: this is the name of the file you want to read ALL targets of. It will be read into a 2 dimensional
    list. To retrieve the first patient's target: read_ans(ans.csv)[0]
put_single_into_dataframe(txt): This functions takes in 2 dimensional list ie the output of read_text(fold1.txt) 
put_multiple_into_dataframe(txt): Multiple is for using it with the output of read_text after you wanted to change it to append
'''

In [2]:
def read_text(fold_name):
    txt_all = list()
    for f in os.listdir(fold_name): # for each file in the directory
        with open(os.path.join(fold_name, f), 'r') as fp: # open each file
            txt = fp.readlines() # read inside the file
        recordid = txt[1].rstrip('\n').split(',')[-1] # get recordid
        txt = [[int(recordid)] + t.rstrip('\n').split(',') for t in txt] # preface each row with the recordid as all patients are 1 file
        txt_all.extend(txt[1:]) # skip the parameter list
    return txt_all

def read_ans(file_name):
    txt_all = list()
    with open(file_name, 'r') as fp: # opens the csv file
        txt = fp.readlines() 
    for i in range(1, len(txt)): # similar to above read_text
        record_id, length_of_stay, hospital_death = txt[i].rstrip('\n').split(',')
        txt_all.append([record_id, length_of_stay, hospital_death])
    return txt_all

def put_multiple_into_dataframe(txt_all):
    df = pd.DataFrame()
    for i in txt_all:
        df2 = pd.DataFrame(i, columns=['recordid', 'time', 'parameter', 'value'])
        df = df.append(df2, ignore_index=True)
    return df

def put_single_into_dataframe(txt_all):
    df = pd.DataFrame(txt_all, columns=['recordid', 'time', 'parameter', 'value'])
    return df

In [3]:
# Reading features
fold1 = put_single_into_dataframe(read_text("./Project_Data/Fold1"))
fold2 = put_single_into_dataframe(read_text("./Project_Data/Fold2"))
fold3 = put_single_into_dataframe(read_text("./Project_Data/Fold3"))
fold4 = put_single_into_dataframe(read_text("./Project_Data/Fold4"))
df_feat = fold1.copy()
df_feat = df_feat.append(fold2)
df_feat = df_feat.append(fold3)
df_feat = df_feat.append(fold4)
df_feat.head()

Unnamed: 0,recordid,time,parameter,value
0,132539,00:00,RecordID,132539
1,132539,00:00,Age,54
2,132539,00:00,Gender,0
3,132539,00:00,Height,-1
4,132539,00:00,ICUType,4


In [4]:
# Reading Target
df_target = pd.DataFrame(read_ans('./Project_Data/Fold1_Outcomes.csv'), columns=['recordid', 'days_in_hospital', 'mortality'])
df_target.head()

Unnamed: 0,recordid,days_in_hospital,mortality
0,132539,5,0
1,132540,8,0
2,132541,19,0
3,132543,9,0
4,132545,4,0


In [5]:

bin_feat = ['MechVent']
num_feat = ['Albumin', 'ALP', 'ALT', 'AST', 'Bilirubin', 'BUN', 'Cholesterol',
           'Creatinine', 'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT',
           'HR', 'K', 'Lactate', 'Mg', 'MAP', 'NA', 'NIDiasABP', 'NIMAP',
           'NISysABP', 'PaCO2', 'PaO2', 'pH', 'Platelets', 'RespRate', 'SaO2',
           'SysABP', 'Temp', 'Tropl', 'TropT', 'Urine', 'WBC', 'Weight']

print("Number of record ids:", len(df_feat['recordid'].unique()))
unique_count = df_feat['parameter'].value_counts()/3000
print(unique_count)

Number of record ids: 4000
HR             76.179333
MAP            48.586667
SysABP         48.550000
DiasABP        48.522333
Urine          45.639333
Weight         43.055000
NISysABP       32.777000
NIDiasABP      32.736667
NIMAP          32.290333
Temp           28.801667
GCS            20.521000
RespRate       18.350333
FiO2           10.796667
MechVent       10.381333
pH              8.118333
PaCO2           7.764333
PaO2            7.756000
HCT             6.090333
K               4.813333
Platelets       4.701333
Creatinine      4.661000
BUN             4.638667
HCO3            4.537667
Mg              4.530000
Na              4.523333
Glucose         4.340333
WBC             4.303333
SaO2            2.728333
Lactate         2.674667
Age             1.333333
ICUType         1.333333
Gender          1.333333
Height          1.333333
RecordID        1.333333
Bilirubin       1.063667
AST             1.060667
ALT             1.059000
ALP             1.030667
Albumin         0.78533

<h2>Analysis of Features</h2>
<p>The data above shows the average number of times a variable observed per patient. Based on the data above and the feature description we classify the features into these categories:
<ul>
    <li>General Descriptors (static data) that are collected when the patient is admitted to the ICU. Weight is not included as weight are measured multiple times as a time series data. Each of the descriptors will be included as a feature into the model.</li>
    <li>Rare features: measured on average less than one time per patient (less than 1.0). We use the <u>existence</u> of these measurements for each patient as a feature.</li>
    <li>Features that measured often or more that one time per patient (more than 1.0). Calculate the hourly average of each measurements and put them into 48 columns. <i>Example, average HR on the first hour to HR_1, average HR on the second hour to HR_2, and so on.</i></li>
</ul>
</p>


In [6]:
stat_feat = ['Age', 'Gender', 'Height', 'ICUType', 'RecordID'] #General Descriptors
rare_feat = []
nor_feat = []
for index, value in unique_count.items():
    if value < 1.0:
        rare_feat.append(index)
    elif index not in stat_feat:
        nor_feat.append(index)
print("Rare features", rare_feat)
print("Normal features", nor_feat)

Rare features ['Albumin', 'TroponinT', 'TroponinI', 'Cholesterol']
Normal features ['HR', 'MAP', 'SysABP', 'DiasABP', 'Urine', 'Weight', 'NISysABP', 'NIDiasABP', 'NIMAP', 'Temp', 'GCS', 'RespRate', 'FiO2', 'MechVent', 'pH', 'PaCO2', 'PaO2', 'HCT', 'K', 'Platelets', 'Creatinine', 'BUN', 'HCO3', 'Mg', 'Na', 'Glucose', 'WBC', 'SaO2', 'Lactate', 'Bilirubin', 'AST', 'ALT', 'ALP']
