## Libraries import

In [1]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
import sklearn 

## Import data with pandas

Import data from CSV

In [2]:
training_inputs = pd.read_csv("train_features.csv")
training_labels = pd.read_csv("train_labels.csv")
test_features = pd.read_csv("test_features.csv")

Sort data according to the patient ID

In [3]:
training_inputs = training_inputs.sort_values(by=['pid'])
training_labels = training_labels.sort_values(by=['pid'])
test_features = test_features.sort_values(by = ['pid'])

Create a single label set called _labels clean_

In [4]:
BaseExcess = training_labels['LABEL_BaseExcess']
Fibrinogen = training_labels['LABEL_Fibrinogen']
AST = training_labels['LABEL_AST']
Alkalinephos = training_labels['LABEL_Alkalinephos']
Bilirubin_total = training_labels['LABEL_Bilirubin_total']
Lactate = training_labels['LABEL_Lactate']
TroponinI = training_labels['LABEL_TroponinI']
SaO2 = training_labels['LABEL_SaO2']
Bilirubin_direct = training_labels['LABEL_Bilirubin_direct']
EtCO2 = training_labels['LABEL_EtCO2']
labels_clean = np.array([BaseExcess,Fibrinogen,AST,Alkalinephos,Bilirubin_total,Lactate,TroponinI,SaO2,Bilirubin_direct,EtCO2])

## Preprocessing of the features

Replace NaN with zeros in both training and test features

In [5]:
training_inputs = training_inputs.fillna(0)
test_features = test_features.fillna(0)

In [6]:
#-------We first perform some data imputation with a pre built class from Sklearn ------- 
#The data imputation has to be done for each single patient so that values don't get mixed up
#To do this we use the following loop
#Define an empty array 
inputs = training_inputs
patients = inputs.loc[inputs['pid']==1].mean();
#Since some patients id do not exist the range is conservative
for i in range(np.max(inputs['pid'])-1):
    #First of all we split the data for a single patient
    patient = inputs.loc[inputs['pid']==i+2].mean()
    print(i,end ='\r')
    #Some patient number are missing so we have to assert that the patient exists 
    if np.isnan(patient.mean()) :
        pass
    else: 
         patients = np.vstack((patients,patient))

31656

Repeat the same operation with the test features

In [7]:
#We do the same for test features
inputs = test_features
test_patients = inputs.loc[inputs['pid']==0].mean();
#Since some patients id do not exist the range is conservative
for i in range(np.max(inputs['pid'])-1):
    #First of all we split the data for a single patient
    test_patient = inputs.loc[inputs['pid']==i+2].mean()
    print(i,end ='\r')
    #Some patient number are missing so we have to assert that the patient exists 
    if np.isnan(test_patient.mean()) :
        pass
    else: 
         test_patients = np.vstack((test_patients,test_patient))

31653

Drop PID and Time columns

In [8]:
test_patients = pd.DataFrame(test_patients).drop(columns=[0,1])
patients = pd.DataFrame(patients).drop(columns=[0,1])

# Subtask 1

Now that the preprocessing is finished, we can start training our estimators. First, import the necessay libraries.

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

Generate the model

In [10]:
clf = make_pipeline(StandardScaler(), SVC(kernel='rbf',probability=True, class_weight='balanced'))

Generate the vector containing all the labels we want to predict

In [11]:
labels_to_predict = np.array(['BaseExcess','Fibrinogen','AST','Alkalinephos','Bilirubin_total','Lactate','TroponinI','SaO2','Bilirubin_direct','EtCO2'])

Make the first prediction and initialize the vector that will contains all the predictions, initialize the vector containing the scores

In [12]:
clf.fit(patients,labels_clean[0])
probability = clf.predict_proba(test_patients)[:,1]

task1_predictions = probability

Standardize data, then train the SVM and generate the AUC score, repeat this for all labels and store the predictions in the prediction vector

In [13]:
for i in range(labels_to_predict.shape[0]-1):
    clf.fit(patients,labels_clean[i+1])
    probability = clf.predict_proba(test_patients)[:,1]
    task1_predictions = np.vstack([task1_predictions,probability])
    print('Done ' + str(i+2) + ' labels out of ' + str(labels_to_predict.shape[0]), end ='\r')

Done 10 labels out of 10

## Subtask 2

Now we predict the occurrence of sepsis. First, create the vector containing the true labels.

In [14]:
label_sepsis = training_labels['LABEL_Sepsis']

Next, fit the model and add an entry to the prediction matrix

In [15]:
clf.fit(patients,label_sepsis)
task2_predictions = clf.predict_proba(test_patients)[:,1]

## Subtask 3

We now want to predict the mean value of the vital sign in the remaining stay. First, let us create the model.

In [16]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error

Create feature vector and label vector

In [17]:
labs = training_labels.columns
Y = training_labels[labs[12:16]]

Create the model and fit

In [18]:
clf3 = KernelRidge(alpha=0,kernel = 'rbf')
#Fit the model
clf3.fit(patients,Y)

KernelRidge(alpha=0, kernel='rbf')

Compute predictions

In [20]:
task3_predictions = clf3.predict(test_patients)

## Export data to csv

Create full prediction matrix

In [50]:
task1_trans = np.transpose(task1_predictions).shape

(12664, 4)

In [71]:
oursample = pd.read_csv('sample.csv')
oursample.columns

Index(['pid', 'LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST',
       'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate',
       'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct',
       'LABEL_EtCO2', 'LABEL_Sepsis', 'LABEL_RRate', 'LABEL_ABPm',
       'LABEL_SpO2', 'LABEL_Heartrate'],
      dtype='object')

In [72]:
oursample[['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST','LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate','LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct','LABEL_EtCO2']] = task1_trans[:,0:10]

In [73]:
oursample['LABEL_Sepsis'] = task2_predictions[:]

In [74]:
oursample[['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']] = task3_predictions[:,0:4]

In [75]:
oursample.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')
oursample.to_csv('prediction.csv', index=False, float_format='%.3f')
