# Constructing data set with sepsis features

In [8]:
import numpy as np
from os import listdir

path = "../training/"
paths = [path + "p0" + str(10000+i)[1:] + ".psv" for i in range(1, 5001)]
data = np.array([np.loadtxt(open(path + file), delimiter='|', skiprows=1) for file in paths])
keys = open(paths[0]).readline().rstrip().split('|')

def delete_columns_nan(data,keys):
    df = {}
    for i, column in enumerate(data.T):
        if not np.isnan(column).all():
            df[keys[i]] = column
    return df


def replace_nan_by_value(data, value=None):
    for j, patient in enumerate(data):
        for key in patient.keys():
            if value is not None:
                for i in range(len(patient[key])):
                    p = patient[key].copy()[~np.isnan(patient[key])]
                    if np.isnan(patient[key][i]):
                        if value == 'mean':
                            data[j][key][i] = np.mean(p)
                        if value == 'normal':
                            data[j][key][i] = np.random.normal(np.mean(p), np.std(p))
            else:
                patient[key] = np.nan_to_num(patient[key])
    return data

data_aux = []
for patient in data:
    data_aux.append(delete_columns_nan(patient, keys))

# Count non-EMPTY entries throughout patients
df = {}
for key in keys:
    df[key] = 0

for patient in data_aux:
    for key in patient.keys():
        df[key] += 1
    
    
data_new = replace_nan_by_value(data_aux, 'normal')

data_aux = []
sepsis_keys = {}
df = {}

for key in keys:
    df[key] = 0
    sepsis_keys[key] = 0

for patient in data:
    data_aux.append(delete_columns_nan(patient, keys))
    if np.any(data_aux[-1]['SepsisLabel']) == 1:
        for key in data_aux[-1].keys():
            sepsis_keys[key] += 1

keys_s = []
for key in sepsis_keys.keys():
    if sepsis_keys[key] == np.max(list(sepsis_keys.values())):
        keys_s.append(key)
print(keys_s)

['HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Age', 'Gender', 'HospAdmTime', 'ICULOS', 'SepsisLabel']


# New Data set

In [4]:
new_dataset = []
for patient in data_new:
    if len(keys_s & patient.keys()) == len(keys_s):
        new_dataset.append([patient[d] for d in keys_s])
new_dataset = np.hstack(new_dataset).T

# Begin Classification

In [5]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.model_selection import LeaveOneOut, KFold, ShuffleSplit
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Perceptron
from xgboost import XGBClassifier

# scaler = MinMaxScaler()
x = np.array(new_dataset)[:,:-1]
# scaler = scaler.fit(x)
# x = scaler.transform(x)

y = new_dataset[:,-1]
print(len(np.where(y==1)[0]))
# x_under_sampled = [i for i in x[np.where(y!=1)[0]]]
# x_under_sampled.append(x[np.where(y==1)[0]])

2623


# SMOTE

Smote is an oversampling technique used to balance unbalanced data. In this case it oversamples the class with less number of samples.<br>
We could also undersample the class with the highest number of samples.

Smote should only be applied in the training set not to compromise the classification task. As it generates new data that might be similar to the input data, it can lead to overfit.

In [None]:
# from imblearn.over_sampling import SMOTE

# # Possibly cause overfit
# x, y = SMOTE(sampling_strategy=.5).fit_resample(x, y)

# Principal Component Analysis
PCA allows to find components that carry most of the information and to reduce data dimensionality, hopefully, improving the classifiers' results.

For Tree-based classifiers, such as, decision tree, random forest and adaboost, it should not be used, because those algorithm perform feature selection automatically.

In [27]:
from sklearn.decomposition import PCA

print(x.shape)

pca = PCA(.95)
x = pca.fit_transform(x)

print(x.shape)

(187948, 9)
(187948, 2)


In [98]:
loo = LeaveOneOut()
kf = KFold(n_splits=10)

for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
#     random_forest = RandomForestClassifier(n_estimators=1)
#     random_forest = random_forest.fit(X_train, y_train)
#     results = random_forest.predict(X_test)
#     print(f1_score(y_test, results))
       
#     svm_c = svm.OneClassSVM(gamma='auto', verbose=True, max_iter=1)
#     svm_c = svm_c.fit(X_train, y_train)
#     results = svm_c.predict(X_test)
#     print(f1_score(y_test, results, average='micro'))
    
    knear = knn()
    knear = knear.fit(X_train, y_train)
    results = knear.predict(X_test)
    print(f1_score(y_test, results))

TRAIN: [ 37065  37066  37067 ... 370647 370648 370649] TEST: [    0     1     2 ... 37062 37063 37064]
0.22906927326816826
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [37065 37066 37067 ... 74127 74128 74129]
0.23592017738359206
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [ 74130  74131  74132 ... 111192 111193 111194]
0.2498255408234473
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [111195 111196 111197 ... 148257 148258 148259]
0.2239813736903376
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [148260 148261 148262 ... 185322 185323 185324]
0.23757455268389666
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [185325 185326 185327 ... 222387 222388 222389]
0.9946911896956863
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [222390 222391 222392 ... 259452 259453 259454]
0.9976201422505883
TRAIN: [     0      1      2 ... 370647 370648 370649] TEST: [259455 259456 259457 ... 296517 296518 296519]
0.99

In [99]:
print(accuracy_score(y_test, results))

0.9947659517064616


In [7]:
# Used to avoid overfit
rs = ShuffleSplit(n_splits=10, train_size=.9, test_size=.1, random_state=0)
auc = []

for train_index, test_index in rs.split(x):
    
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    scaler = MinMaxScaler()
    x = new_dataset[:,:-1]
    scaler = scaler.fit(X_train)
    x = scaler.transform(x)
    X_train, X_test = x[train_index], x[test_index]

    y = new_dataset[:,-1]
    X_train, y_train = SMOTE(sampling_strategy=1).fit_resample(X_train, y_train)
    
#     random_forest = RandomForestClassifier(n_estimators=10)
#     random_forest = random_forest.fit(X_train, y_train)
#     results = random_forest.predict(X_test)

    adaboost = AdaBoostClassifier(RandomForestClassifier(5), algorithm='SAMME')
    adaboost = adaboost.fit(X_train, y_train)
    results = adaboost.predict(X_test)
    
#     clf = XGBClassifier()
#     clf = clf.fit(X_train,y_train)
#     results = clf.predict(X_test)
    
#     clf = GradientBoostingClassifier()
#     clf = clf.fit(X_train,y_train)
#     results = clf.predict(X_test)
    
#     clf = BaggingClassifier(base_estimator=RandomForestClassifier(100))
#     clf = clf.fit(X_train, y_train)
#     results = clf.predict(X_test)
    
#     clf = Perceptron(tol=1e-3, random_state=0)
#     clf = clf.fit(X_train, y_train)
#     results = clf.predict(X_test)

#     naive_bayes = GaussianNB()
#     naive_bayes = naive_bayes.fit(X_train, y_train)
#     results = naive_bayes.predict(X_test)
    
#     qda = QuadraticDiscriminantAnalysis()
#     qda = qda.fit(X_train, y_train)
#     results = qda.predict(X_test)
    
#     gauss = GaussianNB()
#     gauss = gauss.fit(X_train, y_train)
#     results = gauss.predict(X_test)
    
# x.shape[1], 2
#     clf = MLPClassifier(solver='adam', activation='logistic', alpha=1e-2, hidden_layer_sizes=(x.shape[1], 1), random_state=42)
#     clf = clf.fit(X_train, y_train)
#     results = clf.predict(X_test)
    
#     svm_c = svm.OneClassSVM(gamma='auto', verbose=1, max_iter=10)
#     svm_c = svm_c.fit(X_train, y_train)
#     results = svm_c.predict(X_test)

    auc.append(roc_auc_score(y_test, results))
    print(auc[-1])

TRAIN: [153458  57532  23934 ... 117952 173685  43567] TEST: [ 65507 172667  51337 ...  85652  36256  70268]
0.760894136722743
TRAIN: [ 88697 148863   7626 ...  19176  22375  52939] TEST: [ 68376  41372 132309 ...  44056  65354   4883]
0.7640856018782918
TRAIN: [ 76054 102214  92980 ...  67694  37528  79594] TEST: [ 23282 134278 120480 ...   7679 120643 155353]


KeyboardInterrupt: 

In [None]:
print("Mean: " + str(100*np.mean(auc)) + " +- " + str(100*np.std(auc)))