# Constructing data set with sepsis features

In [1]:
import numpy as np
from os import listdir

path = "../training/"
paths = [path + "p0" + str(10000+i)[1:] + ".psv" for i in range(1, 5001)]
data = np.array([np.loadtxt(open(path + file), delimiter='|', skiprows=1) for file in paths])
keys = open(paths[0]).readline().rstrip().split('|')

def delete_columns_nan(data,keys):
    df = {}
    for i, column in enumerate(data.T):
        if not np.isnan(column).all():
            df[keys[i]] = column
    return df


def replace_nan_by_value(data, value=None):
    for j, patient in enumerate(data):
        for key in patient.keys():
            if value == 'normal':
                for i in range(len(patient[key])):
                    p = patient[key].copy()[~np.isnan(patient[key])]
                    if np.isnan(patient[key][i]):
                        if value == 'mean':
                            data[j][key][i] = np.mean(p)
                        if value == 'normal':
                            data[j][key][i] = np.random.normal(np.mean(p), np.std(p))
            else:
                patient[key] = np.nan_to_num(patient[key])
    return data

data_aux = []
for patient in data:
    data_aux.append(delete_columns_nan(patient, keys))

# Count non-EMPTY entries throughout patients
df = {}
for key in keys:
    df[key] = 0

for patient in data_aux:
    for key in patient.keys():
        df[key] += 1
    
    
data_new = replace_nan_by_value(data_aux, None)

data_aux = []
sepsis_keys = {}
df = {}

for key in keys:
    df[key] = 0
    sepsis_keys[key] = 0

for patient in data:
    data_aux.append(delete_columns_nan(patient, keys))
    if np.any(data_aux[-1]['SepsisLabel']) == 1:
        for key in data_aux[-1].keys():
            sepsis_keys[key] += 1

keys_s = []
for key in sepsis_keys.keys():
    if sepsis_keys[key] == np.max(list(sepsis_keys.values())):
        keys_s.append(key)
print(keys_s)

['HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Age', 'Gender', 'HospAdmTime', 'ICULOS', 'SepsisLabel']


# New Data set

In [2]:
new_dataset = []
for patient in data_new:
    if len(keys_s & patient.keys()) == len(keys_s):
        new_dataset.append([patient[d] for d in keys_s])
new_dataset = np.hstack(new_dataset).T

# Begin Classification

In [94]:
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.model_selection import LeaveOneOut, KFold, ShuffleSplit
from sklearn.metrics import f1_score, accuracy_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier

scaler = MinMaxScaler()
x = new_dataset[:,:-1]
scaler = scaler.fit(x)
x = scaler.transform(x)

y = new_dataset[:,-1]
print(len(np.where(y==1)[0]))
# x_under_sampled = [i for i in x[np.where(y!=1)[0]]]
# x_under_sampled.append(x[np.where(y==1)[0]])

2623


# SMOTE

Smote is an oversampling technique used to balance unbalanced data. In this case it oversamples the class with less number of samples.<br>
We could also undersample the class with the highest number of samples.

In [95]:
from imblearn.over_sampling import SMOTE

x, y = SMOTE().fit_resample(x, y)

# Principal Component Analysis
PCA allows to find components that carry most of the information and to reduce data dimensionality, hopefully, improving the classifiers' results.

In [96]:
from sklearn.decomposition import PCA

print(x.shape)

pca = PCA(.95)
x = pca.fit_transform(x)

print(x.shape)

(370650, 9)
(370650, 5)


In [56]:
loo = LeaveOneOut()
kf = KFold(n_splits=10)

for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    random_forest = rf(n_estimators=1)
    random_forest = random_forest.fit(X_train, y_train)
    results = random_forest.predict(X_test)
    print(f1_score(y_test, results))
    
#     svm_c = svm.OneClassSVM(gamma='auto', verbose=True, max_iter=1)
#     svm_c = svm_c.fit(X_train, y_train)
#     results = svm_c.predict(X_test)
#     print(f1_score(y_test, results, average='micro'))
    
#     knear = knn()
#     knear = knear.fit(X_train, y_train)
#     results = knear.predict(X_test)
#     print(f1_score(y_test, results))

TRAIN: [ 37065  37066  37067 ... 370647 370648 370649] TEST: [    0     1     2 ... 37062 37063 37064]
0.20842017507294708
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [37065 37066 37067 ... 74127 74128 74129]
0.20734962020119072
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [ 74130  74131  74132 ... 111192 111193 111194]
0.22850729517396184
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [111195 111196 111197 ... 148257 148258 148259]
0.20278330019880716
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [148260 148261 148262 ... 185322 185323 185324]
0.19854401058901391
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [185325 185326 185327 ... 222387 222388 222389]
0.9663705205414107
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [222390 222391 222392 ... 259452 259453 259454]
0.9669012612361508
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [259455 259456 259457 ... 296517 296518 296519]
0.

In [57]:
print(accuracy_score(y_test, results))

0.9388641575610415


In [97]:
rs = ShuffleSplit(n_splits=10, train_size=0.5, test_size=.25, random_state=0)

for train_index, test_index in rs.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
#     random_forest = rf(n_estimators=1000)
#     random_forest = random_forest.fit(X_train, y_train)
#     results = random_forest.predict(X_test)
#     print(f1_score(y_test, results))
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(x.shape[1], 1), random_state=42)
    clf = clf.fit(X_train, y_train)
    results = clf.predict(X_test)
    print(f1_score(y_test, results))
    
#     svm_c = svm.OneClassSVM(gamma='auto', verbose=True, max_iter=1)
#     svm_c = svm_c.fit(X_train, y_train)
#     results = svm_c.predict(X_test)
#     print(f1_score(y_test, results, average='micro'))
    

TRAIN: [223808  82199  59933 ... 157660 357773  90437] TEST: [122930 264761  35758 ... 342925 303203   5343]
0.602131991855312
TRAIN: [162475  38981 107818 ... 345121 252108 194569] TEST: [  4849 290585 194297 ...  33959 294669 220344]
0.5963226126638663
TRAIN: [  3562   7524 366351 ...  32224  57594  94626] TEST: [ 92360  63935 201089 ... 362877 112286 169202]
0.5790783801833527
TRAIN: [276697 222825  54815 ...  58307 227993  66758] TEST: [318370 195007 190635 ...  95257  14434  48999]
0.5821330568937131
TRAIN: [140706  73401 106162 ... 206222 218703   2706] TEST: [ 44606 206898 283930 ...  47980 177878 226414]
0.5800201767286959
TRAIN: [288933 363863 272781 ... 332276  63240  53216] TEST: [ 78492 156968 253257 ...  38538 133873   4323]
0.5766790155155339
TRAIN: [184789   7438 281403 ... 260475 369106  43280] TEST: [356809 370315 105677 ... 214439  65198 152947]
0.5899968604342262
TRAIN: [333591 138164 315998 ...  29126 133346 322599] TEST: [244871 213903 116223 ... 114231 245832  275

In [98]:
print(accuracy_score(y_test, results))

0.6269924349524622
