# Libraries import

In [1]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
import sklearn 

## Import data with pandas

Import data from CSV

In [2]:
training_inputs = pd.read_csv('train_features.csv')
training_labels = pd.read_csv('train_labels.csv')
test_inputs = pd.read_csv('test_features.csv')

Sort data according to the patient ID

In [3]:
training_inputs = training_inputs.sort_values(by=['pid','Time'])
training_labels = training_labels.sort_values(by=['pid'])
test_inputs = test_inputs.sort_values(by=['pid','Time'])

Create a vector containing the column labels of the feature vectors and the label vectors

In [4]:
features = training_inputs.columns
labels = training_labels.columns

## Preprocessing of the features

Transform the 12 observations for each patient in the training set and in the tesing set into a sigle observation by calculating the mean of the values that are not NaN. Next, replace all NaN with zeros.

In [5]:
# training set

train_patients = training_inputs.loc[training_inputs['pid']==1].mean();

for i in range(np.max(training_inputs['pid'])-1):

    current_patient = training_inputs.loc[training_inputs['pid']==i+2].mean()  
    
    print('Train patients:  ' + str(i), end ='\r')

    #Some patient number are missing so we have to assert that the patient exists 
    
    if np.isnan(current_patient.mean()) :
        pass
    else: 
         train_patients = np.vstack((train_patients,current_patient))
            
# testing set

test_patients = test_inputs.loc[test_inputs['pid']==0].mean();


for i in range(np.max(test_inputs['pid'])-1):

    current_patient = test_inputs.loc[test_inputs['pid']==i+2].mean()

    print('Test patients:  ' + str(i), end ='\r')
    
    #Some patient number are missing so we have to assert that the patient exists 
    
    if np.isnan(current_patient.mean()) :
        pass
    else: 
         test_patients = np.vstack((test_patients,current_patient))

Test patients:  316536

Create pandas dataframe for the training and the testing set

In [6]:
# use temporary variables not to mess up the original ones

X_train = pd.DataFrame(train_patients, columns = features)
X_test = pd.DataFrame(test_patients, columns = features)

In [7]:
# Now we can compare the modified data with the original

#k = 8 # k corresponds to the feature we are testing
#j = 1000 # j corresponds to the patient 
#j = 12*j

#test_inputs_old = pd.read_csv('test_features.csv')
#test_inputs_old = test_inputs_old.sort_values(by=['pid','Time'])
#print('Feature is: ' +str(features[k] + '\n'))
#print('Test input is: ' + '\n\n' + str(test_inputs_old[features[k]][j:j+12]) + '\n')
#print('Modified test input is: ' + str(X_test[features[k]][j/12]))

Remove features with more than 85% of NaN

In [8]:
for i in range(features.shape[0]):
    
    # training set
    
    NAN_percentage = np.count_nonzero(np.isnan(train_patients[:,i]))/train_patients.shape[0]
    if NAN_percentage > 0.85:
        X_train = X_train.drop(columns=[features[i]])
        
    # test set
        
    NAN_percentage = np.count_nonzero(np.isnan(test_patients[:,i]))/test_patients.shape[0]
    if NAN_percentage > 0.85:
        X_test = X_test.drop(columns=[features[i]])

Now replace all remaining NaN with zeros

In [9]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

Finally, remove "pid" and "time" columns

In [10]:
X_train = X_train.drop(columns=['pid', 'Time'])
X_test = X_test.drop(columns=['pid', 'Time'])

# Subtask 1

Now that the preprocessing is finished, we can start training our estimators. First, import the necessay libraries.

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

First, create a vector containing all the labels we want to predict

In [12]:
labels_task1 = labels[1:11]

Next, generate the model

In [13]:
clf = make_pipeline(StandardScaler(), SVC(kernel='rbf',probability=True, class_weight='balanced'))

Make the first prediction and initialize the vector that will contains all the predictions, initialize the vector containing the scores

In [14]:
clf.fit(X_train,training_labels[labels_task1[0]]) # fit

probability = clf.predict_proba(X_train)[:,1] # compute probability

task1_predictions = probability # here we will store all the predictions for task 1

Score_task1 = roc_auc_score(training_labels[labels_task1[0]],probability) # here we will store the AUC for every prediction

print('Done 1 label out of ' + str(labels_task1.shape[0]) + ', score is ' + str(Score_task1), end ='\r')

Done 1 label out of 10, score is 0.9311783298054152

Train the SVM and generate the AUC score, repeat this for all labels and store the predictions in the prediction vector

In [None]:
for i in range(labels_task1.shape[0]-1):
    
    clf.fit(X_train,training_labels[labels_task1[i+1]]) # fit
    
    probability = clf.predict_proba(X_train)[:,1] # compute probability
    
    task1_predictions = np.vstack([task1_predictions,probability]) # add to prediction vector
    
    Score_task1 = np.vstack([Score_task1,roc_auc_score(training_labels[labels_task1[i+1]],probability)]) # compute AUC
    
    print('Done ' + str(i+2) + ' labels out of ' + str(labels_task1.shape[0]) + ', score is ' + str(Score_task1[i+1]), end ='\r')

Done 3 labels out of 10, score is [0.80849723]

# Subtask 2

Now we predict the occurrence of sepsis. First create the vector containing the label we want to predict.

In [None]:
labels_task2 = labels[11]

In [None]:
clf.fit(X_train,training_labels[labels_task2]) # fit

task2_predictions = clf.predict_proba(X_train)[:,1] # create prediction vector

Score_task2 = roc_auc_score(training_labels[labels_task2],task2_predictions) # generate AUC

print('Done, score is ' + str(Score_task2), end ='\r')

# Subtask 3

We now want to predict the mean value of the vital sign in the remaining stay. First, let us create the model.

In [None]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error

Create feature vector and label vector

In [None]:
labs = training_labels.columns
labels_task3 = training_labels[labs[12:16]]

First create the model

In [None]:
clf3 = make_pipeline(StandardScaler(), KernelRidge(alpha=0,kernel = 'rbf'))

Make the first prediction, compute the score

In [None]:
clf3.fit(X_train,training_labels[labels_task3[0]]) # fit

task3_predictions = clf3.predict(X_train) # predict

MSE = mean_squared_error(training_labels[labels_task3[0]],task3_predictions,squared = True) # compute error

In [None]:
for i in range(labels_task3.shape[0]-1):
    
    clf3.fit(X_train,training_labels[labels_task3[i+1]]) # fit
    
    prediction = clf3.predict(X_train) # predict
    
    task3_predictions = np.vstack([task3_predictions,prediction]) # add to prediction vector
    
    MSE = np.vstack([MSE,mean_squared_error(training_labels[labels_task3[i+1]],prediction,squared = True)]) # compute AUC
    
    print('Done ' + str(i+2) + ' predictions out of ' + str(labels_task3.shape[0]) + ', score is ' + str(MSE[i+1]), end ='\r')

# Export

Create solution and export to zip

In [None]:
task1_trans = np.transpose(task1_predictions)
Sol = training_labels
# add probabilities of first task

Sol[labels_task1]= task1_trans[:,0:10]

# add probabilities of second task

Sol[labels_task2] = task2_predictions[:]

# add results of third task

Sol[labels_task3] = task3_predictions[:,0:4]

# export to zip file

compression_opts = dict(method='zip',archive_name='Prediction.csv')
Sol.to_csv('Prediction.csv', index=False, float_format='%.3f', compression='zip')

Create ground truth from training labels csv file and export to zip

In [None]:
compression_opts = dict(method='zip',archive_name='Truth.csv')
training_labels.to_csv('Truth.zip', index=False, float_format='%.3f',compression=compression_opts)