In [1]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt 
import sklearn 
from sklearn.impute import SimpleImputer

# Data Loading

In [2]:
path_to_training_set = "./task2_k49am2lqi/train_features.csv"
path_to_label_set = "./task2_k49am2lqi/train_labels.csv"
training_inputs = pd.read_csv(path_to_training_set)
training_inputs = training_inputs.sort_values(by = ['pid'])
training_labels = pd.read_csv(path_to_label_set)
training_labels = training_labels.sort_values(by = ['pid'])
path_to_test_features = "./task2_k49am2lqi/test_features.csv"
test_features = pd.read_csv(path_to_test_features)
test_features = test_features.sort_values(by = ['pid'])
test_features

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,0,1,39.0,,,,,,,,...,,,,,,,,,,
10,0,11,39.0,,,,,,,,...,,,,,,,,,,
9,0,10,39.0,,,,,36.0,,,...,,100.0,,,,85.0,,,120.0,
8,0,9,39.0,,,,,36.0,,,...,,100.0,,,,90.0,,,121.0,
7,0,8,39.0,,,,,36.0,,,...,,100.0,,,,90.0,,,129.0,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115922,31655,3,23.0,,56.8,,,,11.8,,...,,100.0,,,37.1,125.0,,,,
115921,31655,2,23.0,,,,,,,,...,,100.0,,,,131.0,,6.27,120.0,
115920,31655,1,23.0,,,,,,,,...,,,,,,,,,,
115924,31655,5,23.0,,,,,36.0,,,...,,100.0,,,,119.0,,,120.0,


## Get Labels

In [3]:
BaseExcess = training_labels['LABEL_BaseExcess']
Fibrinogen = training_labels['LABEL_Fibrinogen']
AST = training_labels['LABEL_AST']
Alkalinephos = training_labels['LABEL_Alkalinephos']
Bilirubin_total = training_labels['LABEL_Bilirubin_total']
Lactate = training_labels['LABEL_Lactate']
TroponinI = training_labels['LABEL_TroponinI']
SaO2 = training_labels['LABEL_SaO2']
Bilirubin_direct = training_labels['LABEL_Bilirubin_direct']
EtCO2 = training_labels['LABEL_EtCO2']
labels_clean = np.array([BaseExcess,Fibrinogen,AST,Alkalinephos,Bilirubin_total,Lactate,TroponinI,SaO2,Bilirubin_direct,EtCO2])
labels_clean.shape # To get a single label set 

(10, 18995)

## Data Imputation

##### Firstly, we need to get rid of NaN values, our first try is to set the NaN values to zero

In [4]:
training_inputs = training_inputs.fillna(0)
test_features = test_features.fillna(0)

In [5]:
#-------We first perform some data imputation with a pre built class from Sklearn ------- 
#The data imputation has to be done for each single patient so that values don't get mixed up
#To do this we use the following loop
#Define an empty array 
inputs = training_inputs
patients = inputs.loc[inputs['pid']==1].mean();
#Since some patients id do not exist the range is conservative
for i in range(np.max(inputs['pid'])-1):
    #First of all we split the data for a single patient
    patient = inputs.loc[inputs['pid']==i+2].mean()
    print(i,end ='\r')
    #Some patient number are missing so we have to assert that the patient exists 
    if np.isnan(patient.mean()) :
        pass
    else: 
         patients = np.vstack((patients,patient))

print('Finished Cleaning Input data')
            

Finished Cleaning Input data


In [6]:
#We do the same for test features
inputs = test_features
test_patients = inputs.loc[inputs['pid']==1].mean();
#Since some patients id do not exist the range is conservative
for i in range(np.max(inputs['pid'])-1):
    #First of all we split the data for a single patient
    test_patient = inputs.loc[inputs['pid']==i+2].mean()
    print(i,end ='\r')
    #Some patient number are missing so we have to assert that the patient exists 
    if np.isnan(patient.mean()) :
        pass
    else: 
         test_patients = np.vstack((test_patients,test_patient))

print('Finished Cleaning Test data')

Finished Cleaning Test data


In [7]:
test_features

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,0,1,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
10,0,11,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
9,0,10,39.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,...,0.0,100.0,0.0,0.0,0.0,85.0,0.0,0.00,120.0,0.0
8,0,9,39.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,...,0.0,100.0,0.0,0.0,0.0,90.0,0.0,0.00,121.0,0.0
7,0,8,39.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,...,0.0,100.0,0.0,0.0,0.0,90.0,0.0,0.00,129.0,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115922,31655,3,23.0,0.0,56.8,0.0,0.0,0.0,11.8,0.0,...,0.0,100.0,0.0,0.0,37.1,125.0,0.0,0.00,0.0,0.0
115921,31655,2,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,100.0,0.0,0.0,0.0,131.0,0.0,6.27,120.0,0.0
115920,31655,1,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0
115924,31655,5,23.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,...,0.0,100.0,0.0,0.0,0.0,119.0,0.0,0.00,120.0,0.0


## Subtask 1 : Binary Classication for test need prediction
Based on the biometric data collected during the first 12h of hospital stay we wish to predict if the patient will need one of the 10 test, reported below.


#### Define the SVM classifier

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True))

In [9]:
#classifiers = np.array([])
#iteration = 0
#for l in labels_clean.T:
    #l is a single vector of ground truth labels
#    iteration += 1
#    print(iteration)
#    print(l.shape)
#    classifiers = np.append(classifiers,clf.fit(patients,l))
#    print(classifiers)

In [10]:
print(labels_clean[0].shape)
print(patients.shape)
clf.fit(patients,labels_clean[0])

(18995,)
(18995, 37)


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto', probability=True))])

In [11]:
print(test_features.shape)
predictions = clf.predict_log_proba(test_features)

(151968, 37)


In [33]:
probabilities = np.exp(predictions)
#This vector contains the probabilities that the test is one (necessary) or 0 (not necessary) (check its shape its #of patients X 2)
#We want to assess only the prob that the test is necessary so we keep only tehe first row 
prediction = probabilities.T[0] #Correct shape
prediction

array([0.88882054, 0.91368932, 0.88663888, ..., 0.86474267, 0.88560247,
       0.85196818])

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
#Define Generic Pipeline 
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True))
iteration = 0
final = np.array([])
#For each test
for l in labels_clean:
    #l is a single vector of ground truth labels
    iteration += 1
    #Fit Pipeline for a particular label
    print(patients.shape,l.shape)
    clf.fit(patients,l)
    #Predict The label
    predictions = np.exp(clf.predict_log_proba(test_features))
    #Append in prediction matrix (for each pid, for each label)
    final = np.array([final,probabilities.T[0]])

(18995, 37) (18995,)


## Subtask 2 : Binary Classification for Sepsi Prediction
##### In this second task we aim to predict, if, for a single patient, sepsi will occur during his hospitla stay

In [12]:
Sepsi = training_labels['LABEL_Sepsis']
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',probability=True))
probs = np.exp(clf.predict_log_proba(patients[pid].reshape(-1,37)))

NameError: name 'pid' is not defined

## Subtask 3 : Predicting the mean values of vital signs with Regression
##### In this second task we aim to predict the mean of a certain vital sign during the remaining hospital stay of the patient

##### Our regressor will take as input the 12h informations from each patient, according to his pid. 
##### As output it will have the predicted average of vital signs in the following 12 hours, computed according to the test_set of vital signs in the next 12 hours, averaged.

In [None]:
patients # contains the 12 hours information about the patients 
test_features #Contains the avergae of vitsal signs in the successive 12 hours

##### The PID number of course is not a regression variable, neither the time but temporal informations might be useful as inputs. 

In [None]:
patients = patients[2:] # Get rid of PID and TIME
test_features = test_features[3:]  # Get rid of PID, TIME and AGE

In [None]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=0)
clf.fit(patients,test_features)