# Libraries import

In [15]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
import sklearn 

## Import data with pandas

Import data from CSV

In [25]:
training_inputs = pd.read_csv('train_features.csv')
training_labels = pd.read_csv('train_labels.csv')
test_inputs = pd.read_csv('test_features.csv')

Create a vector containing the column labels of the feature vectors and the label vectors

In [26]:
features = training_inputs.columns
labels = training_labels.columns

## Preprocessing of the features

Transform the 12 observations for each patient in the training set and in the tesing set into a sigle observation by calculating the mean of the values that are not NaN. Next, replace all NaN with zeros.

In [27]:
# training set

train_patients = training_inputs[0:12].mean()

for i in range(int(training_inputs.shape[0]/12)-1):

    current_patient = training_inputs[(i+1)*12:(i+1)*12+12].mean()  
    
    print('Train patients:  ' + str(i), end ='\r')

    #Some patient number are missing so we have to assert that the patient exists 
    
    if np.isnan(current_patient.mean()) :
        pass
    else: 
         train_patients = np.vstack((train_patients,current_patient))
            
# testing set

test_patients = test_inputs[0:12].mean()


for i in range(int(test_inputs.shape[0]/12)-1):

    current_patient = test_inputs[(i+1)*12:(i+1)*12+12].mean()  

    print('Test patients:  ' + str(i), end ='\r')
    
    #Some patient number are missing so we have to assert that the patient exists 
    
    if np.isnan(current_patient.mean()) :
        pass
    else: 
         test_patients = np.vstack((test_patients,current_patient))

Test patients:  126623

Create pandas dataframe for the training and the testing set

In [36]:
# use temporary variables not to mess up the original ones

X_train = pd.DataFrame(train_patients, columns = features)
X_test = pd.DataFrame(test_patients, columns = features)

Remove features with more than 85% of NaN

In [37]:
for i in range(features.shape[0]):
    
    NAN_percentage = np.count_nonzero(np.isnan(test_patients[:,i]))/test_patients.shape[0]
    if NAN_percentage > 0.85:
        X_train = X_train.drop(columns=[features[i]])
        X_test = X_test.drop(columns=[features[i]])

Now replace all remaining NaN with zeros

In [38]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

Finally, remove "pid" and "time" columns

In [40]:
X_train = X_train.drop(columns=['pid', 'Time'])
X_test = X_test.drop(columns=['pid', 'Time'])

# Subtask 1

Now that the preprocessing is finished, we can start training our estimators. First, import the necessay libraries.

In [41]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

First, create a vector containing all the labels we want to predict

In [42]:
labels_task1 = labels[1:11]

Next, generate the model

In [43]:
clf = make_pipeline(StandardScaler(), SVC(kernel='rbf',probability=True, class_weight='balanced'))

Make the first prediction and initialize the vector that will contains all the predictions, initialize the vector containing the scores

In [44]:
clf.fit(X_train,training_labels[labels_task1[0]]) # fit

probability = clf.predict_proba(X_test)[:,1] # compute probability

task1_predictions = probability # here we will store all the predictions for task 1

print('Done 1 label out of ' + str(labels_task1.shape[0]), end ='\r')

Done 1 label out of 10

Train the SVM and generate the AUC score, repeat this for all labels and store the predictions in the prediction vector

In [45]:
for i in range(labels_task1.shape[0]-1):
    
    clf.fit(X_train,training_labels[labels_task1[i+1]]) # fit
    
    probability = clf.predict_proba(X_test)[:,1] # compute probability
    
    task1_predictions = np.vstack([task1_predictions,probability]) # add to prediction vector
    
    print('Done ' + str(i+2) + ' labels out of ' + str(labels_task1.shape[0]), end ='\r')

Done 10 labels out of 10

# Subtask 2

Now we predict the occurrence of sepsis. First create the vector containing the label we want to predict.

In [46]:
labels_task2 = labels[11]

In [47]:
clf.fit(X_train,training_labels[labels_task2]) # fit

task2_predictions = clf.predict_proba(X_test)[:,1] # create prediction vector

print('Done')

Done


# Subtask 3

We now want to predict the mean value of the vital sign in the remaining stay. First, let us create the model.

In [48]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error

Create feature vector and label vector

In [49]:
labels_task3 = labels[12:16]

First create the model

In [50]:
clf3 = make_pipeline(StandardScaler(), KernelRidge(alpha=1.0,kernel = 'rbf'))

Make the first prediction, compute the score

In [51]:
clf3.fit(X_train,training_labels[labels_task3[0]]) # fit

task3_predictions = clf3.predict(X_test) # predict

print('Done 1 predictions out of ' + str(labels_task3.shape[0]))

Done 1 predictions out of 4


In [52]:
for i in range(labels_task3.shape[0]-1):
    
    clf3.fit(X_train,training_labels[labels_task3[i+1]]) # fit
    
    prediction = clf3.predict(X_test) # predict
    
    task3_predictions = np.vstack([task3_predictions,prediction]) # add to prediction vector
    
    print('Done ' + str(i+2) + ' predictions out of ' + str(labels_task3.shape[0]), end ='\r')

Done 4 predictions out of 4

# Export

Create solution and export to zip

In [54]:
# transpose solutions to task 1 and 3 to match dimensions

task1_trans = np.transpose(task1_predictions)
task3_trans = np.transpose(task3_predictions)

Prediction = pd.read_csv('Sample.zip') # initialize by using the sample submission

# add probabilities of first task

Prediction[labels_task1]= task1_trans[:,0:10]

# add probabilities of second task

Prediction[labels_task2] = task2_predictions[:]

# add results of third task

Prediction[labels_task3] = task3_trans[:,0:4]

# export to zip file

compression_opts = dict(method='zip',archive_name='Pred.csv')
Prediction.to_csv('Pred.zip', index=False, float_format='%.3f', compression=compression_opts)