## Libraries import

In [11]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
import sklearn 

## Import data with pandas

Import data from CSV

In [35]:
training_inputs = pd.read_csv('/Users/pietrozullo/Task2Files/train_features.csv')
training_labels = pd.read_csv('/Users/pietrozullo/Task2Files/train_labels.csv')
test_features = pd.read_csv('/Users/pietrozullo/Task2Files/test_features.csv')
#training_inputs[22:50]

Sort data according to the patient ID

In [36]:
training_inputs = training_inputs.sort_values(by=['pid','Time'])
training_labels = training_labels.sort_values(by=['pid'])
test_features = test_features.sort_values(by=['pid','Time'])

Create a single label set called _labels clean_

In [15]:
BaseExcess = training_labels['LABEL_BaseExcess']
Fibrinogen = training_labels['LABEL_Fibrinogen']
AST = training_labels['LABEL_AST']
Alkalinephos = training_labels['LABEL_Alkalinephos']
Bilirubin_total = training_labels['LABEL_Bilirubin_total']
Lactate = training_labels['LABEL_Lactate']
TroponinI = training_labels['LABEL_TroponinI']
SaO2 = training_labels['LABEL_SaO2']
Bilirubin_direct = training_labels['LABEL_Bilirubin_direct']
EtCO2 = training_labels['LABEL_EtCO2']
labels_clean = np.array([BaseExcess,Fibrinogen,AST,Alkalinephos,Bilirubin_total,Lactate,TroponinI,SaO2,Bilirubin_direct,EtCO2])

## Preprocessing of the features

##### First of all we perform what's called data imputation, both on the test and training features

In [46]:
from sklearn.impute import SimpleImputer
train_imputer = SimpleImputer(missing_values = np.nan,strategy = 'median')
train_imputer.fit(training_inputs.iloc[:,2:])
imputed_inputs = train_imputer.transform(training_inputs.iloc[:,2:])
training_inputs.iloc[:,2:] = imputed_inputs
#Imputing test features
test_imputer = SimpleImputer(missing_values = np.nan,strategy = 'median')
test_imputer.fit(test_features.iloc[:,2:])
imputed_test_inputs = test_imputer.transform(test_features.iloc[:,2:])
test_features.iloc[:,2:] = imputed_test_inputs

##### New data imputation strategy, we simply subsitutute the NaN with zeros and use the whole time series to predict. 
##### Before fitting we need to normalize the time so that it goes from 0 to 12

In [48]:
training_inputs.loc[training_inputs['pid'] == 1,'Time'] = training_inputs.loc[training_inputs['pid'] == 1]['Time'] - training_inputs.loc[training_inputs['pid'] == 1]['Time'].min()+1

In [49]:
training_inputs.loc[training_inputs['pid'] == training_labels['pid'][0] ,['Time']]  = training_inputs.loc[training_inputs['pid'] == training_labels['pid'][0],['Time']] - training_inputs.loc[training_inputs['pid'] == training_labels['pid'][0]]['Time'].min()+1
patients = np.array([training_inputs.loc[training_inputs['pid']==training_labels['pid'][0]]]).flatten()[1:]
for pid in training_labels['pid'][1:]:
    """Get the start of the hospital stay and subtract the starting hour to the time interval, 
    #so that each time series starts from one and reaches 12
    #Unfortunately it's a bit messy for speed purposes, you can check if it does work for you by printing the commented stuff"""
    #print('PID',pid)
    #print('before')
    #print(training_inputs.loc[training_inputs['pid'] == pid,['Time']])
    training_inputs.loc[training_inputs['pid'] == pid,['Time']]  = training_inputs.loc[training_inputs['pid'] == pid,['Time']] - training_inputs.loc[training_inputs['pid'] == pid]['Time'].min()+1
    patient = np.array([training_inputs.loc[training_inputs['pid']==pid]]).flatten()[1:]
    patients = np.vstack((patients,patient))
    #print('Patient', patients.shape)
    #print('after')
    #print(training_inputs.loc[training_inputs['pid'] == pid,['Time']])

In [50]:
test_features.loc[test_features['pid'] == pd.unique(test_features['pid'])[0] ,['Time']]  = test_features.loc[test_features['pid'] == pd.unique(test_features['pid'])[0],['Time']] - test_features.loc[test_features['pid'] == pd.unique(test_features['pid'])[0],['Time']].min()+1
test_patients = np.array([test_features.loc[test_features['pid']== pd.unique(test_features['pid'])[0]]]).flatten()[1:]
for pid in pd.unique(test_features['pid'])[1:]:
    """Get the start of the hospital stay and subtract the starting hour to the time interval, 
    #so that each time series starts from one and reaches 12
    #Unfortunately it's a bit messy for speed purposes, you can check if it does work for you by printing the commented stuff"""
    #print('PID',pid)
    #print('before')
    #print(training_inputs.loc[training_inputs['pid'] == pid,['Time']])
    test_features.loc[test_features['pid'] == pid,['Time']]  = test_features.loc[test_features['pid'] == pid,['Time']] - test_features.loc[test_features['pid'] == pid]['Time'].min()+1
    test_patient = np.array([test_features.loc[test_features['pid']==pid]]).flatten()[1:]
    test_patients = np.vstack((test_patients,test_patient))
    #print('Patient', test_patients.shape)
    #print('after')
    #print(training_inputs.loc[training_inputs['pid'] == pid,['Time']])

# Subtask 1

Now that the preprocessing is finished, we can start training our estimators. First, import the necessay libraries.

In [51]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

Generate the model

In [52]:
clf = make_pipeline(StandardScaler(), SVC(kernel='rbf',probability=True, class_weight='balanced'))

Generate the vector containing all the labels we want to predict

In [53]:
labels_to_predict = np.array(['BaseExcess','Fibrinogen','AST','Alkalinephos','Bilirubin_total','Lactate','TroponinI','SaO2','Bilirubin_direct','EtCO2'])

Make the first prediction and initialize the vector that will contains all the predictions, initialize the vector containing the scores

In [54]:
clf.fit(patients,labels_clean[0])
probability = clf.predict_proba(test_patients)[:,1]

task1_predictions = probability

Standardize data, then train the SVM and generate the AUC score, repeat this for all labels and store the predictions in the prediction vector

In [55]:
for i in range(labels_to_predict.shape[0]-1):
    clf.fit(patients,labels_clean[i+1])
    probability = clf.predict_proba(test_patients)[:,1]
    task1_predictions = np.vstack([task1_predictions,probability])
    print('Done ' + str(i+2) + ' labels out of ' + str(labels_to_predict.shape[0]), end ='\r')

Done 10 labels out of 10

## Subtask 2

Now we predict the occurrence of sepsis. First, create the vector containing the true labels.

In [56]:
label_sepsis = training_labels['LABEL_Sepsis']

Next, fit the model and add an entry to the prediction matrix

In [57]:
clf.fit(patients,label_sepsis)
task2_predictions = clf.predict_proba(test_patients)[:,1]

## Subtask 3

We now want to predict the mean value of the vital sign in the remaining stay. First, let us create the model.

In [58]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error

Create feature vector and label vector

In [59]:
labs = training_labels.columns
Y = training_labels[labs[12:16]]

Create the model and fit

In [60]:
clf3 = KernelRidge(alpha=0,kernel = 'rbf')
#Fit the model
clf3.fit(patients,Y)

KernelRidge(alpha=0, kernel='rbf')

Compute predictions

In [61]:
task3_predictions = clf3.predict(test_patients)

## Export data to csv

Create full prediction matrix

In [62]:
task1_trans = np.transpose(task1_predictions)

In [63]:
oursample = pd.read_csv('./task2_k49am2lqi/sample.csv')
oursample.columns

Index(['pid', 'LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST',
       'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate',
       'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct',
       'LABEL_EtCO2', 'LABEL_Sepsis', 'LABEL_RRate', 'LABEL_ABPm',
       'LABEL_SpO2', 'LABEL_Heartrate'],
      dtype='object')

In [64]:


oursample[['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST','LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate','LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct','LABEL_EtCO2']]= task1_trans[:,0:10]




In [65]:
oursample['LABEL_Sepsis'] = task2_predictions[:]

In [66]:
oursample[['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']] = task3_predictions[:,0:4]

In [67]:
oursample.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')
oursample.to_csv('prediction.csv', index=False, float_format='%.3f')

In [68]:
oursample

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.478431,0.115091,0.349060,0.350369,0.354635,0.411764,0.058742,0.371724,0.038399,0.055780,0.061053,2.161551e-49,1.042070e-48,1.125164e-48,9.865206e-49
1,10001,0.046816,0.039031,0.284682,0.293838,0.296724,0.076350,0.422649,0.073226,0.018610,0.037959,0.070951,2.183744e-25,8.594842e-25,1.132744e-24,1.096543e-24
2,10003,0.063836,0.051096,0.202339,0.198331,0.190575,0.076626,0.157968,0.126015,0.029593,0.044051,0.055294,1.573651e-16,6.197991e-16,8.321713e-16,6.504655e-16
3,10004,0.483899,0.124799,0.358640,0.360644,0.364882,0.421820,0.058602,0.378467,0.038906,0.054948,0.061157,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,10005,0.095706,0.046530,0.380774,0.371333,0.360987,0.103534,0.094818,0.136448,0.021987,0.029626,0.070836,1.524797e-20,5.919295e-20,8.163658e-20,6.339042e-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,9989,0.076340,0.037058,0.211565,0.206287,0.203850,0.059361,0.031035,0.061981,0.052732,0.017675,0.046822,4.542293e-19,2.372573e-18,2.825432e-18,2.396216e-18
12660,9991,0.724852,0.059975,0.147831,0.146320,0.133132,0.530989,0.064196,0.594415,0.026707,0.039624,0.059365,6.826114e-107,2.041900e-106,2.878575e-106,2.630749e-106
12661,9992,0.729424,0.060394,0.145543,0.149813,0.158279,0.264093,0.041292,0.520162,0.029704,0.033695,0.070080,5.412409e-20,2.204931e-19,3.001509e-19,1.947056e-19
12662,9994,0.090280,0.015265,0.474013,0.456428,0.451454,0.143435,0.124654,0.242272,0.047376,0.050167,0.081459,4.502154e-49,2.476185e-48,2.541623e-48,2.083555e-48
