# Constructing data set with sepsis features

In [1]:
import numpy as np
from os import listdir

path = "../training/"
# Patient files are named p00001.psv ... p05000.psv inside `path`.
paths = ["{}p{:05d}.psv".format(path, i) for i in range(1, 5001)]

# One 2-D float array per patient (rows x columns). Patients may have
# different numbers of rows, so the arrays are stored in a 1-D object array
# instead of asking NumPy to stack a ragged list.
data = np.empty(len(paths), dtype=object)
for i, file in enumerate(paths):
    # `file` already contains `path`; open it directly and close it promptly.
    with open(file) as handle:
        # ndmin=2 keeps single-row files two-dimensional so that column
        # iteration via `.T` keeps working downstream.
        data[i] = np.loadtxt(handle, delimiter='|', skiprows=1, ndmin=2)

# Column names come from the header line of the first file.
with open(paths[0]) as handle:
    keys = handle.readline().rstrip().split('|')

def delete_columns_nan(data, keys):
    """Map column names to columns, skipping columns that are entirely NaN.

    Parameters
    ----------
    data : 2-D array
        Table whose columns are visited through ``data.T``.
    keys : sequence
        Column names; ``keys[i]`` names the i-th column of ``data``.

    Returns
    -------
    dict
        ``{name: column}`` for every column holding at least one non-NaN
        value, in column order.
    """
    return {
        keys[index]: values
        for index, values in enumerate(data.T)
        if not np.isnan(values).all()
    }


def replace_nan_by_value(data, value=None):
    """Fill the NaN entries of every column of every patient.

    Parameters
    ----------
    data : list of dict
        One dict per patient, mapping a column name to a 1-D numeric array.
    value : {None, 'mean', 'normal'}, optional
        Imputation strategy:

        * ``'mean'``   - NaNs become the mean of the column's observed values.
        * ``'normal'`` - NaNs become independent draws from a normal
          distribution with the mean and standard deviation of the column's
          observed values (uses the global ``np.random`` state).
        * anything else (default ``None``) - ``np.nan_to_num`` is applied,
          i.e. NaNs become 0.

    Returns
    -------
    list of dict
        The same ``data`` list. Each dict entry is rebound to a newly
        allocated, filled array; the arrays passed in are not written to.

    Notes
    -----
    A column with no observed value at all has no statistics to impute
    from, so it falls back to ``np.nan_to_num`` for every strategy.
    """
    for patient in data:
        for key in patient.keys():
            if value in ('mean', 'normal'):
                column = np.asarray(patient[key], dtype=float)
                missing = np.isnan(column)
                # Statistics come from the originally observed values only,
                # computed once, so imputed entries never feed back into them.
                observed = column[~missing]
                if observed.size == 0:
                    patient[key] = np.nan_to_num(column)
                    continue
                # Work on a copy: the column may be a view into the raw table.
                filled = column.copy()
                if value == 'mean':
                    filled[missing] = observed.mean()
                else:
                    filled[missing] = np.random.normal(
                        observed.mean(), observed.std(), size=int(missing.sum())
                    )
                patient[key] = filled
            else:
                patient[key] = np.nan_to_num(patient[key])
    return data

# Per-patient dicts that keep only the columns with at least one value.
data_aux = [delete_columns_nan(patient, keys) for patient in data]

# For each column, count the patients in which that column survived,
# i.e. was not entirely NaN.
df = dict.fromkeys(keys, 0)
for patient in data_aux:
    for key in patient:
        df[key] += 1

# Default strategy (None): remaining NaNs are replaced via np.nan_to_num.
data_new = replace_nan_by_value(data_aux, None)

data_aux = []
df = dict.fromkeys(keys, 0)
# sepsis_keys[name]: number of sepsis-positive patients that have column `name`.
sepsis_keys = dict.fromkeys(keys, 0)

for patient in data:
    data_aux.append(delete_columns_nan(patient, keys))
    # A patient counts as sepsis-positive when any SepsisLabel entry is non-zero.
    if np.any(data_aux[-1]['SepsisLabel']):
        for key in data_aux[-1]:
            sepsis_keys[key] += 1

# Keep the columns that reach the highest count among sepsis-positive patients.
max_count = np.max(list(sepsis_keys.values()))
keys_s = [name for name, count in sepsis_keys.items() if count == max_count]
print(keys_s)

['HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Age', 'Gender', 'HospAdmTime', 'ICULOS', 'SepsisLabel']


# New data set (restricted to the selected sepsis features)

In [2]:
# Keep only the patients that have every selected column, and lay their
# columns out in the order given by keys_s.
required = set(keys_s)
new_dataset = [
    [patient[d] for d in keys_s]
    for patient in data_new
    if required.issubset(patient.keys())
]
# Concatenate all patients along the row axis, then transpose so that each
# row is one record and each column is one selected key.
new_dataset = np.hstack(new_dataset).T

# Begin Classification

In [3]:
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier as knn

In [5]:
# All columns but the last are features; the last column is the target.
x, y = new_dataset[:, :-1], new_dataset[:, -1]
loo = LeaveOneOut()  # created here but not used in this cell
kf = KFold(n_splits=10)

# NOTE(review): folds are built over rows of the stacked table, so rows of
# one patient may land on both sides of a split - confirm this is intended.
for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = x[train_index], y[train_index]
    X_test, y_test = x[test_index], y[test_index]

    # Alternatives previously evaluated the same way (fit, predict, F1):
    #   rf(n_estimators=10)
    #   svm.SVC(gamma='auto')

    # k-nearest neighbours with the default settings.
    knear = knn().fit(X_train, y_train)
    results = knear.predict(X_test)
    print(f1_score(y_test, results))

TRAIN: [ 18795  18796  18797 ... 187945 187946 187947] TEST: [    0     1     2 ... 18792 18793 18794]
0.10505836575875487
TRAIN: [     0      1      2 ... 187945 187946 187947] TEST: [18795 18796 18797 ... 37587 37588 37589]
0.06179775280898876
TRAIN: [     0      1      2 ... 187945 187946 187947] TEST: [37590 37591 37592 ... 56382 56383 56384]
0.14661654135338348
TRAIN: [     0      1      2 ... 187945 187946 187947] TEST: [56385 56386 56387 ... 75177 75178 75179]
0.02252252252252252
TRAIN: [     0      1      2 ... 187945 187946 187947] TEST: [75180 75181 75182 ... 93972 93973 93974]
0.010666666666666666
TRAIN: [     0      1      2 ... 187945 187946 187947] TEST: [ 93975  93976  93977 ... 112767 112768 112769]
0.031168831168831165
TRAIN: [     0      1      2 ... 187945 187946 187947] TEST: [112770 112771 112772 ... 131562 131563 131564]
0.011869436201780416
TRAIN: [     0      1      2 ... 187945 187946 187947] TEST: [131565 131566 131567 ... 150357 150358 150359]
0.0
TRAIN: [   