## Libraries import

In [14]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
import sklearn 

## Import data with pandas

Import data from CSV

In [15]:
training_inputs = pd.read_csv("train_features.csv")
training_labels = pd.read_csv("train_labels.csv")
test_features = pd.read_csv("test_features.csv")

Sort data according to the patient ID

In [16]:
training_inputs = training_inputs.sort_values(by=['pid'])
training_labels = training_labels.sort_values(by=['pid'])
test_features = test_features.sort_values(by = ['pid'])

Create a single label set called _labels clean_

In [17]:
BaseExcess = training_labels['LABEL_BaseExcess']
Fibrinogen = training_labels['LABEL_Fibrinogen']
AST = training_labels['LABEL_AST']
Alkalinephos = training_labels['LABEL_Alkalinephos']
Bilirubin_total = training_labels['LABEL_Bilirubin_total']
Lactate = training_labels['LABEL_Lactate']
TroponinI = training_labels['LABEL_TroponinI']
SaO2 = training_labels['LABEL_SaO2']
Bilirubin_direct = training_labels['LABEL_Bilirubin_direct']
EtCO2 = training_labels['LABEL_EtCO2']
labels_clean = np.array([BaseExcess,Fibrinogen,AST,Alkalinephos,Bilirubin_total,Lactate,TroponinI,SaO2,Bilirubin_direct,EtCO2])

## Preprocessing of the features

Replace NaN with zeros in both training and test features

In [18]:
training_inputs = training_inputs.fillna(0)
test_features = test_features.fillna(0)

In [19]:
#-------We first perform some data imputation with a pre built class from Sklearn ------- 
#The data imputation has to be done for each single patient so that values don't get mixed up
#To do this we use the following loop
#Define an empty array 
inputs = training_inputs
patients = inputs.loc[inputs['pid']==1].mean();
#Since some patients id do not exist the range is conservative
for i in range(np.max(inputs['pid'])-1):
    #First of all we split the data for a single patient
    patient = inputs.loc[inputs['pid']==i+2].mean()
    print(i,end ='\r')
    #Some patient number are missing so we have to assert that the patient exists 
    if np.isnan(patient.mean()) :
        pass
    else: 
         patients = np.vstack((patients,patient))

31656

Repeat the same operation with the test features

In [20]:
#We do the same for test features
inputs = test_features
test_patients = inputs.loc[inputs['pid']==0].mean();
#Since some patients id do not exist the range is conservative
for i in range(np.max(inputs['pid'])-1):
    #First of all we split the data for a single patient
    test_patient = inputs.loc[inputs['pid']==i+2].mean()
    print(i,end ='\r')
    #Some patient number are missing so we have to assert that the patient exists 
    if np.isnan(test_patient.mean()) :
        pass
    else: 
         test_patients = np.vstack((test_patients,test_patient))

31653

Drop PID and Time columns

In [23]:
test_patients = pd.DataFrame(test_patients).drop(columns=[0,1])
patients = pd.DataFrame(patients).drop(columns=[0,1])

# Subtask 1

Now that the preprocessing is finished, we can start training our estimators. First, import the necessay libraries.

In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

Standardize data, then train the SVM and generate the AUC score

In [26]:
clf = make_pipeline(StandardScaler(), SVC(kernel='rbf',probability=True))
clf.fit(patients,labels_clean[1])
roc_auc_score(labels_clean[1],clf.predict_proba(patients)[:,1])

0.884172735760971

In [42]:
print(labels_clean[0].shape)
print(patients.shape)
clf.fit(patients,labels_clean[0])

(18995,)
(18995, 37)


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(kernel='sigmoid', probability=True))])

In [43]:
print(test_features.shape)
predictions = clf.predict_proba(test_patients)

(151968, 37)


Now we read the sample CSV and replace some columns of it with our predictions, then we run the evaluation code to see if we score well.

In [53]:
oursample = pd.read_csv('sample.csv')
oursample['LABEL_BaseExcess'] = predictions[:,0]
oursample

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.653985,0.341,0.597,0.651,0.557,0.745,0.224,0.363,0.506,0.643,0.162,18.796,82.511,96.947,84.12
1,10001,0.794163,0.320,0.451,0.152,0.001,0.525,0.276,0.327,0.316,0.656,0.486,18.796,82.511,96.947,84.12
2,10003,0.835339,0.211,0.348,0.153,0.859,0.446,0.406,0.607,0.757,0.290,0.451,18.796,82.511,96.947,84.12
3,10004,0.961179,0.312,0.733,0.129,0.356,0.367,0.931,0.715,0.434,0.005,0.785,18.796,82.511,96.947,84.12
4,10005,0.532992,0.746,0.587,0.743,0.248,0.330,0.071,0.291,0.399,0.217,0.040,18.796,82.511,96.947,84.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,9989,0.857361,0.541,0.373,0.944,0.562,0.594,0.838,0.938,0.401,0.195,0.647,18.796,82.511,96.947,84.12
12660,9991,0.395515,0.040,0.095,0.667,0.918,0.323,0.784,0.343,0.552,0.047,0.916,18.796,82.511,96.947,84.12
12661,9992,0.379529,0.962,0.967,0.564,0.064,0.545,0.210,0.853,0.429,0.829,0.093,18.796,82.511,96.947,84.12
12662,9994,0.715282,0.540,0.868,0.201,0.259,0.632,0.282,0.810,0.724,0.074,0.936,18.796,82.511,96.947,84.12


(18995,)

0.8867542727235821

In [57]:
oursample.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')
oursample.to_csv('prediction.csv', index=False, float_format='%.3f')
