# TASK 2
Current considerations:
1. Current public score: ~0.756 (hard baseline: 0.772478169274)
- Experiment with different parametrizations and models (currently only one per subtask)
- Experiment with preprocessing (maybe I'll try to use PCA and see if it can make a difference)

In [44]:
# Summarise the cross-validated score of each subtask. avg_s1, avg_s2 and
# avg_s3 are computed in the subtask cells further down; this cell carries the
# highest execution counter (In [44]), i.e. it was run after them.
print("Current scores for subtask 1, 2 and 3:")
print(avg_s1,avg_s2,avg_s3)

# Unweighted mean of the three subtask scores.
print("Average score:")
sub_avg =np.average([avg_s1, avg_s2, avg_s3])
print(sub_avg)

# Gap between the hard baseline (0.772478169274) and the current mean,
# multiplied by 3: the total gain that must be found across the three
# subtask scores combined to reach the baseline.
print("Overall improvement needed (per subtask)")
print((0.772478169274 - sub_avg)*3)

Current scores for subtask 1, 2 and 3:
[0.79939] [0.70877] [0.74617]
Average score:
0.7514460305868481
Overall improvement needed (per subtask)
0.06309641606145577


## Libraries - needed

In [1]:
# Needed libraries
import pandas as pd #Pandas
import numpy as np #Numpy
import sklearn #Sklearn
from sklearn import datasets, linear_model
from sklearn.datasets import make_regression
import math

#libraries needed for preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

#Libraries needed for imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#Libraries needed for models
#subtasks 1 and 2
from sklearn import svm
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import ClassifierChain
#subtask3
from sklearn.linear_model import RidgeCV

#Libraries needed for plotters
from matplotlib import pyplot as plt

#Libraries needed for scoring
import sklearn.metrics as metrics

np.set_printoptions(precision=5,suppress=True, linewidth=300)


## Data Manipulation functions - needed

### Auxiliary functions
1. check_all_nan 
- zeroize
- scale
- get_next_val
- patient_pca
- get_patient_matrix

In [2]:
def check_all_nan(vector):
    """Return True when every entry of ``vector`` is NaN.

    ``np.isnan`` already works element-wise, so it is applied directly
    instead of being wrapped in ``np.vectorize`` (which loops in Python and
    refuses empty inputs).

    Parameters:
    Input vector - array-like of numbers
    Output - numpy bool; True for an empty input (vacuous truth)
    """
    # Casting to float keeps object arrays holding plain floats working.
    return np.all(np.isnan(np.asarray(vector, dtype=float)))

In [3]:
def zeroize(vector):
    """Overwrite every element of ``vector`` with 0, in place.

    Parameters:
    Input vector - numpy array of any shape (views are written through)
    Output vector - the same array object, now all zeros
    """
    vector.fill(0)
    return vector

In [4]:
def get_next_val(vector,index):
    """Find the first non-NaN entry of ``vector`` at or after ``index``.

    Parameters:
    Input vector - 1-D numpy array
    Input index - position at which the scan starts
    Output (skipped, value) - number of NaNs stepped over and the value found;
                              value is NaN when the scan runs off the end
    """
    skipped = 0
    while index + skipped < vector.size:
        candidate = vector[index + skipped]
        if not np.isnan(candidate):
            return (skipped, candidate)
        skipped += 1
    return (skipped, np.nan)

In [5]:
def patient_pca(data,c):
    """Fit a PCA with ``c`` components on ``data`` and return its first loading.

    Parameters:
    Input data - 2-D array passed to PCA.fit
    Input c - number of principal components to request
    Output - one-element array holding the first entry of pca.components_
    """
    fitted = PCA(n_components=c).fit(data)
    loadings = fitted.components_
    as_column = np.reshape(loadings, (loadings.size, 1))
    # NOTE(review): indexing row 0 of an (N, 1) column keeps a single value;
    # a flattened vector of all loadings may have been intended - confirm.
    return as_column[0]

In [6]:
def get_patient_matrix(raw_data,i):
    """Return the 12-row block of patient ``i`` without the first 3 columns.

    Parameters:
    Input raw_data - 2-D numpy array, 12 consecutive rows per patient
    Input i - zero-based patient index
    Output - view on rows [12*i, 12*i + 12) and columns [3, width) of raw_data
    """
    _, width = raw_data.shape
    first_row = 12 * i
    return raw_data[first_row:first_row + 12, 3:width]

### Imputation functions
1. interp
- nan_imputer - imputes data using sklearn's IterativeImputer


In [7]:
def interp(vector):
    """Fill the NaNs of a 1-D array in place and return the same array.

    * all-NaN input: every entry becomes 0;
    * leading NaNs: filled with the first known value;
    * trailing NaNs: filled with the last known value;
    * interior gaps: filled linearly between the neighbouring known values.

    Parameters:
    Input vector - 1-D numpy array (modified in place)
    Output vector - the same array, free of NaNs
    """
    if check_all_nan(vector):
        return zeroize(vector)
    prev_val = np.nan
    for i in range(vector.size):
        if not np.isnan(vector[i]):
            prev_val = vector[i]
            continue
        # Only look ahead when the current entry is missing; scanning on
        # every element made the loop quadratic even for complete columns.
        nans, next_val = get_next_val(vector, i)
        if np.isnan(prev_val):
            vector[i] = next_val
        elif np.isnan(next_val):
            vector[i] = prev_val
        else:
            # One equal step towards next_val; (nans + 1) steps remain.
            step = prev_val + (next_val - prev_val) / (nans + 1)
            vector[i] = step
            prev_val = step
    return vector

In [8]:
def nan_imputer(nds,method):
    """
    Given a dataset, replaces its NaNs using the selected imputation method
    Parameters:
    Input nds - Numpy array: dataset
    Input method - method of imputation to use
                   1: sklearn IterativeImputer (estimates each missing value
                      from the other features)
    Output nds_xnan - Numpy array: dataset without NaNs
    Raises ValueError - if method is not a supported value
    """
    if method != 1:
        # Previously an unknown method fell through to an unbound local.
        raise ValueError("Unknown imputation method: {!r}".format(method))
    imp = IterativeImputer(max_iter=100, random_state=0)
    imp.fit(nds)
    nds_xnan = imp.transform(nds)
    return nds_xnan

### Reduction functions
1. time_reduction
- flatten?

In [9]:
def _evolution_score(column):
    """Score one patient's series: sum of (value - previous known value) * (row + 1) / 10.

    The previous value starts at 0, later rows weigh more, NaNs are skipped.
    Returns NaN when the whole column is NaN.
    """
    score = 0
    previous = 0
    seen_value = False
    for row, value in enumerate(column):
        if not np.isnan(value):
            score = score + ((value - previous) * (row + 1) / 10)
            previous = value
            seen_value = True
    return score if seen_value else np.nan


def time_reduction(nds,labels, time, method):
    """
    Given a dataset containing data on consecutive hours, outputs one row per patient
    Parameters:
    Input nds - numpy dataset
    Input labels - list labels of dataset
    Input time - number of consecutive rows (hours) that belong to one patient
    Input method - method of reduction to use
                   1: mean of every column, grouped by the 'pid' column
                   2: time-weighted evolution score of every column
    Output nds_reduced - numpy array, dataset compressed to one row per patient
    Raises ValueError - if method is not 1 or 2
    """
    nds = pd.DataFrame(nds,columns=labels)
    numpat = int(len(nds) / time) #number of patients

    if method==1:
        #Reduce by taking mean of columns for each patient (first-seen order)
        nds_reduced = nds.groupby('pid',sort=False,as_index=False).mean()
        return nds_reduced.to_numpy()

    if method==2:
        #Split into numpat consecutive chunks, one per patient
        patients = np.array_split(nds.to_numpy(),numpat,axis=0)
        rows = [
            [_evolution_score(patient[:,col]) for col in range(patient.shape[1])]
            for patient in patients
        ]
        #Always 2-D, so a single patient yields a (1, n_columns) array
        return np.array(rows,dtype=float)

    raise ValueError("Unknown reduction method: {!r}".format(method))

### Combining functions
1. clean_set
- flatten_min_max_slope

In [10]:
def clean_set(nds,headers,imp_method,time_method,sequence):
    """
    Reduces a dataset to one row per patient and imputes its missing values
    Parameters:
    Input nds - dataset with 12 consecutive rows per patient
    Input headers - list headers of dataset
    Input imp_method - imputation method forwarded to nan_imputer
    Input time_method - reduction method forwarded to time_reduction
    Input sequence - if truthy reduce first and impute afterwards,
                     otherwise impute first and reduce afterwards
    Output - pandas DataFrame with columns headers
    """
    if sequence:
        reduced = time_reduction(nds, headers, 12, time_method)
        cleaned = nan_imputer(reduced, imp_method)
    else:
        imputed = nan_imputer(nds, imp_method)
        cleaned = time_reduction(imputed, headers, 12, time_method)
    return pd.DataFrame(cleaned, columns=headers)

In [11]:
def flatten_min_max_slope(raw_data):
    """Flatten each patient's 12 rows into (min, max, (max - min) / 12) per feature.

    Every feature column is first gap-filled with interp. The input array is
    left untouched: the patient block is copied before filling, because
    get_patient_matrix returns a view and interp writes in place.

    Parameters:
    Input raw_data - 2-D numpy array, 12 consecutive rows per patient; columns
                     from index 3 onward are the features, column 2 is copied
                     through unchanged
    Output res - array of shape (n_patients, 1 + 3 * n_features):
                 column 0 is raw_data column 2 of the patient's first row,
                 followed by min, max, (max - min) / 12 for each feature
    """
    n,w =  raw_data.shape
    c = w - 3
    ndiv = int(n/12)
    res = np.zeros((ndiv,1 + c * 3))
    for i in range(ndiv):
        block = get_patient_matrix(raw_data,i).copy()
        res[i][0] = raw_data[i * 12][2]
        for j in range(c):
            column = interp(block[:,j])
            low = np.min(column)
            high = np.max(column)
            res[i][j*3+1] = low
            res[i][j*3+2] = high
            res[i][j*3+3] = (high-low)/12
    return res

In [12]:
def flatten_pca(raw_data):
    """Build one row per patient from a per-patient PCA of the feature block.

    Parameters:
    Input raw_data - 2-D numpy array, 12 consecutive rows per patient
    Output res - array of shape (n_patients, 1 + 3 * n_features)

    NOTE(review): this function is not called anywhere in the visible file and
    its output layout looks unfinished - see the notes below before using it.
    """
    n,w = raw_data.shape
    c=w-3
    ndiv = int(n/12)
    # NOTE(review): same width as flatten_min_max_slope (1 + 3 per feature);
    # presumably copied from there - confirm the intended PCA layout.
    res = np.zeros((ndiv,1 + c * 3))
    temp = np.zeros((12,c))
    for i in range(ndiv):
        # get_patient_matrix slices raw_data, so the in-place gap filling
        # below also writes into raw_data.
        temp = get_patient_matrix(raw_data,i)
        for j in range (c):
            temp[:,j] = interp(temp[:,j])

        # Column 2 of the patient's first row, divided by 100.
        res[i][0] = raw_data[i * 12][2] /100
        # print(patient_pca(temp,c))
        # NOTE(review): for c >= 2 the stop index c*w-2 exceeds the row length,
        # so the slice covers the rest of the row; patient_pca returns a
        # one-element array, which is assigned to every slot of the slice.
        res[i][1:c*w-2] = patient_pca(temp,c)
    return res

## Model choosing functions

In [13]:
def data_fold_10(nds, f_nr, task):
    """
    Given a dataset, outputs two subsets: test set and training set. The test set is the
    f_nr-th of 10 consecutive partitions of the dataset, the training set is the rest.
    Parameters:
    Input nds - numpy dataset to partition (split along axis 0)
    Input f_nr - zero-based index of the fold used as test set (0-9)
    Input task - 2: remaining folds are joined with np.hstack (1-D label vector),
                 otherwise with np.vstack (2-D data)
    Output (testset, trainingset) - tuple containing the test set and training set
    """
    dss = np.array_split(nds,10,axis=0) #dataset split
    testset = dss[f_nr] #test set (raises IndexError for an invalid fold)
    held_out = f_nr % len(dss) #normalise negative fold indices
    # Keep the folds as a list: converting unequal-length folds to a single
    # array (as np.delete does) fails when the row count is not a multiple of 10.
    train_parts = [part for k, part in enumerate(dss) if k != held_out]
    if task==2:
        trainingset = np.hstack(train_parts) #training set
    else:
        trainingset = np.vstack(train_parts) #training set
    return (testset, trainingset)

In [14]:
def fold10_predict(models, ndsx, ndsy, ndsy_L, task):
    """
    Runs 10-fold cross validation of every model for one of the three subtasks
    Parameters:
    Input models - sequence of models exposing fit and predict / decision_function
    Input ndsx - training features
    Input ndsy - training labels
    Input ndsy_L - training labels headers
    Input task - subtask to fold (1, 2 or 3)
    Output nscores - array (n_models, 10): score of every model on every fold
    """
    n_models = len(models)
    nscores = np.zeros((n_models, 10))
    # For subtask 2 the features are split with task code 1 (task - 1) while
    # the labels keep task code 2; otherwise both use the same code.
    feature_task = task - 1 if task == 2 else task

    for fold in range(10):
        tes_x, trs_x = data_fold_10(ndsx, fold, feature_task)
        tes_y, trs_y = data_fold_10(ndsy, fold, task)

        for idx, model in enumerate(models):
            model.fit(trs_x, trs_y)
            if task == 3:
                # Subtask 3 is scored on the raw predictions.
                tes_yp = model.predict(tes_x)
            else:
                # Subtasks 1 and 2 are scored on squashed decision values.
                tes_yp = predict_sigmoid(model, tes_x)

            # scores() looks columns up by label name, hence the DataFrames.
            df_y = pd.DataFrame(tes_y, columns=ndsy_L)
            df_yp = pd.DataFrame(tes_yp, columns=ndsy_L)
            nscores[idx, fold] = scores(df_y, df_yp, task)
    return nscores

In [15]:
def scores(tes_y, tes_yp, task):
    """
    Computes the score of one subtask
    Parameters:
    Input tes_y - true labels, indexable by label name
    Input tes_yp - predicted labels, indexable by label name
    Input task - subtask: 3 -> mean of 0.5 + 0.5 * max(0, R2) over the vitals,
                          2 -> ROC AUC of LABEL_Sepsis,
                          anything else -> mean ROC AUC over the test labels
    Output score - score of the subtask
    """
    if task == 3:
        vitals = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
        per_label = []
        for entry in vitals:
            r2 = metrics.r2_score(tes_y[entry], tes_yp[entry])
            per_label.append(0.5 + 0.5 * np.maximum(0, r2))
        return np.mean(per_label)

    if task == 2:
        return metrics.roc_auc_score(tes_y['LABEL_Sepsis'], tes_yp['LABEL_Sepsis'])

    tests = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos',
             'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
             'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
    per_label = []
    for entry in tests:
        per_label.append(metrics.roc_auc_score(tes_y[entry], tes_yp[entry]))
    return np.mean(per_label)

In [16]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar.

    The negative branch uses the equivalent form e^x / (1 + e^x) so that
    math.exp is never called with a large positive argument (which raises
    OverflowError for x below roughly -709 in the naive formula).
    """
    if x >= 0:
        return 1/(1 + math.exp(-x))
    decay = math.exp(x)
    return decay/(1 + decay)

In [17]:
def map_sigmoid(X):
    """Apply the logistic function to every element of ``X`` in place.

    Vectorised and overflow-free: exp is only evaluated on non-positive
    arguments. Works for arrays of any number of dimensions.

    Parameters:
    Input X - numpy array (overwritten with the result)
    Output X - the same array object, holding sigmoid values
    """
    decay = np.exp(-np.abs(X))
    # 1/(1+e^-x) for x >= 0 and the equivalent e^x/(1+e^x) for x < 0.
    X[...] = np.where(X >= 0, 1 / (1 + decay), decay / (1 + decay))
    return X

In [18]:
def predict_sigmoid(clf,data):
    """Return the classifier's decision values for ``data``, squashed by map_sigmoid.

    Parameters:
    Input clf - fitted model exposing decision_function
    Input data - features to predict on
    Output - decision values mapped through the sigmoid
    """
    decision_values = clf.decision_function(data)
    return map_sigmoid(decision_values)

In [19]:
def set_scaler(scal,dataset):
    """
    Fits a scaler on a dataset
    Parameters:
    Input scal - scaler to fit (any object exposing fit)
    Input dataset - data to fit the scaler on
    Output - whatever scal.fit(dataset) returns
    """
    fitted = scal.fit(dataset)
    return fitted

# Subtasks - Setup

In [20]:
## Setup datasets and scalers for all subtasks

# Load the training labels and features from the working directory.
dataset_y = pd.read_csv("train_labels.csv")
dataset_x = pd.read_csv("train_features.csv")

# Column names of the feature table.
dataset_x_L = list(dataset_x)

# "Clean" feature set: per-patient column means (time_method=1) computed first
# (sequence=True), then NaNs imputed with IterativeImputer (imp_method=1).
cds = clean_set(dataset_x,dataset_x_L,1,1,True)

# Second feature set: the same CSV read as a plain float array and flattened
# to min / max / (max - min)/12 per feature and patient.
raw_data = np.genfromtxt("./train_features.csv",delimiter=",",skip_header=1)
mdata = flatten_min_max_slope(raw_data)

# Scalers fitted on the clean set. The Normalizer (ncds) is only instantiated
# here; no fit call is made on it.
scds = StandardScaler()
scds.fit(cds)
mmcds = MinMaxScaler()
mmcds.fit(cds)
rcds = RobustScaler()
rcds.fit(cds)
ncds = Normalizer()

# Scalers fitted on the flattened min/max/slope set.
sm = StandardScaler()
sm.fit(mdata)
mmm = MinMaxScaler()
mmm.fit(mdata)
rm = RobustScaler()
rm.fit(mdata)

#Training Labels
#Subtask1: label columns LABEL_BaseExcess .. LABEL_EtCO2
dataset_y1 = dataset_y.loc[:,"LABEL_BaseExcess":"LABEL_EtCO2"] #Labels to be predicted in [0,1] range
ds_y1_L = list(dataset_y1) #headers of labels
ndsy1 = dataset_y1.to_numpy() #to numpy

#Subtask2: single label column LABEL_Sepsis (1-D after to_numpy)
dataset_y2 = dataset_y.loc[:,"LABEL_Sepsis"] #Labels to be predicted in [0,1] range
ds_y2_L = ["LABEL_Sepsis"] #headers of labels
ndsy2 = dataset_y2.to_numpy() #to numpy

#Subtask3: label columns LABEL_RRate .. LABEL_Heartrate
dataset_y3 = dataset_y.loc[:,"LABEL_RRate":"LABEL_Heartrate"] #Labels to be predicted
ds_y3_L = list(dataset_y3) #headers of labels
ndsy3 = dataset_y3.to_numpy() #to numpy

# Subtask1 training models

In [21]:
#subtask 1
#Training Features
# Scale the clean set; exactly one of the four alternatives is active
# (StandardScaler, MinMaxScaler, RobustScaler or Normalizer).
#sds_p1 = pd.DataFrame(scds.transform(cds),columns=dataset_x_L)
#sds_p1 = pd.DataFrame(mmcds.transform(cds),columns=dataset_x_L)
#sds_p1 =pd.DataFrame(rcds.transform(cds),columns=dataset_x_L)
sds_p1 =pd.DataFrame(ncds.transform(cds),columns=dataset_x_L)

# Keep only the columns "Time" through "pH" for prediction.
ds_p1 = sds_p1.loc[:,"Time":"pH"] #reduced dataset for prediction, without pid
#labels of datasets
ds_p1_L = list(ds_p1)
#transform into numpy
nds_p1 = ds_p1.to_numpy()


# Flattened min/max/slope set, scaled with the active scaler (options:
# StandardScaler, MinMaxScaler, RobustScaler). This is the feature matrix the
# subtask-1 cross validation below is run on; nds_p1 is not passed to it.
mnds_p1 = sm.transform(mdata)
#mnds_p1 = mmm.transform(mdata)
#mnds_p1 = rm.transform(mdata)

In [22]:
# Candidate models for subtask 1. Only lsvc_m2 is added to models1 below;
# lsvc_m1 and matt_svcms are defined but not used in this cell.
lsvc_m1 = sklearn.svm.LinearSVC(random_state=0,
                                dual=False, 
                                tol=0.00001,
                                C=5,
                                max_iter=100000,
                                fit_intercept=True
                               )
# L1-penalised linear SVM with squared hinge loss.
lsvc_m2 = sklearn.svm.LinearSVC(random_state=0,
                                penalty='l1',
                                loss = 'squared_hinge',
                                C=0.019,
                                dual=False, 
                                tol=0.001,
                                max_iter=100000,
                                fit_intercept=True
                               )

# RBF-kernel SVM.
matt_svcms = svm.SVC(kernel='rbf',
                     decision_function_shape='ovr', 
                     C=0.6,
                     max_iter=10000000,
                     gamma='auto'
                    )
models1 = []
#0 - Sklearn: OneVsRestClassifier around an RBF SVC (disabled)
# NOTE(review): `svcms` is not defined in the visible code; matt_svcms is
# presumably what was meant - confirm before re-enabling.
#models1 = np.append(models1,OneVsRestClassifier(svcms))
#1 - Sklearn: OneVsRestClassifier using lsvc_m2 (one binary model per label)
models1 = np.append(models1,OneVsRestClassifier(lsvc_m2))

In [23]:
# 10-fold cross validation of the subtask-1 models on the scaled
# min/max/slope features; s1 holds one row per model and one column per fold.
s1 = fold10_predict(models1,mnds_p1,ndsy1,ds_y1_L,1)
pd.DataFrame(s1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.801225,0.811822,0.809428,0.795182,0.79823,0.78475,0.800129,0.791643,0.796676,0.80482


In [24]:
# Mean score over the 10 folds, one entry per model.
avg_s1 = np.average(s1,axis=1)
pd.DataFrame(avg_s1)

Unnamed: 0,0
0,0.79939


In [25]:
# Show the raw array of per-model mean scores for subtask 1.
avg_s1

array([0.79939])

# Subtask2 - training models

In [26]:
#subtask 2
#Training Features
# Scale the clean set; StandardScaler is the active alternative.
sds_p2 = pd.DataFrame(scds.transform(cds),columns=dataset_x_L)
#sds_p2 = pd.DataFrame(mmcds.transform(cds),columns=dataset_x_L)
#sds_p2 =pd.DataFrame(rcds.transform(cds),columns=dataset_x_L)

# Keep only the columns "Time" through "pH" for prediction.
ds_p2 = sds_p2.loc[:,"Time":"pH"] #reduced dataset for prediction, without pid
ds_p2_L = list(ds_p2)
nds_p2 = ds_p2.to_numpy()

# Flattened min/max/slope set with StandardScaler; this is the matrix the
# subtask-2 cross validation below is run on (nds_p2 is not passed to it).
mnds_p2 = sm.transform(mdata)
#mnds_p2 = mmm.transform(mdata)
#mnds_p2 = rm.transform(mdata)

In [27]:
#Model list used for predictions
models2 = []
##mattds, 0.07

# RBF-kernel SVC candidate (currently not added to models2).
svc_m0 = sklearn.svm.SVC(C=0.0019, 
                         kernel='rbf',
                         degree=3,
                         gamma='scale',
                         coef0=0.0,
                         shrinking=True,
                         probability=False,
                         tol=0.001,
                         cache_size=200,
                         class_weight=None,
                         verbose=False, 
                         max_iter=-1,
                         decision_function_shape='ovr', 
                         random_state=None)

# L2-penalised linear SVM. This rebinds the name lsvc_m1 that the subtask-1
# model cell also assigns; models1 holds lsvc_m2, so it is not affected.
lsvc_m1 = sklearn.svm.LinearSVC(random_state=0,
                                penalty='l2',
                                loss = 'squared_hinge',
                                C=0.0019,
                                dual=False, 
                                tol=0.001,
                                max_iter=100000,
                                fit_intercept=True
                               )


# Active model: the LinearSVC above; the SVC variants are left disabled.
#models2 = np.append(models2,matt_svcms)
#from sklearn.svm import LinearSVC
models2 = np.append(models2,lsvc_m1)
#models2 = np.append(models2,svc_m0)

In [28]:
# 10-fold cross validation of the subtask-2 models on the scaled
# min/max/slope features; scored with the ROC AUC of LABEL_Sepsis.
s2 = fold10_predict(models2,mnds_p2,ndsy2,ds_y2_L,2)

In [29]:
# Show the per-fold scores of subtask 2 (rows: models, columns: folds).
pd.DataFrame(s2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.658998,0.715432,0.725145,0.688129,0.736282,0.712471,0.733558,0.742257,0.691423,0.684032


In [30]:
# Mean score over the 10 folds, one entry per model.
avg_s2 = np.average(s2,axis=1)
pd.DataFrame(avg_s2)

Unnamed: 0,0
0,0.708773


In [31]:
# Record of the "current best" LinearSVC parametrisation (L1 penalty, C=0.019).
# NOTE(review): this only rebinds the name lsvc_m1; models2[0] still refers to
# the L2 / C=0.0019 instance appended earlier, so nothing downstream uses this
# model unless the model-list cell is re-run with it.
lsvc_m1 = sklearn.svm.LinearSVC(random_state=0,
                                penalty='l1',
                                loss = 'squared_hinge',
                                C=0.019,
                                dual=False, 
                                tol=0.001,
                                max_iter=100000,
                                fit_intercept=True
                               )

# Subtask3 - training models
Only Ridge regression is included, as it is relatively fast and scores ~0.745.

In [32]:
#subtask 3
#Training Features
# Scale the clean set; MinMaxScaler is the active alternative.
#sds_p3 = pd.DataFrame(scds.transform(cds),columns=dataset_x_L)
sds_p3 = pd.DataFrame(mmcds.transform(cds),columns=dataset_x_L)
#sds_p3 =pd.DataFrame(rcds.transform(cds),columns=dataset_x_L)
#sds_p3 =pd.DataFrame(ncds.transform(cds),columns=dataset_x_L)

# Keep only the columns "Time" through "pH"; nds_p3 is the matrix used for the
# subtask-3 cross validation and for the final fit.
ds_p3 = sds_p3.loc[:,"Time":"pH"] #reduced dataset for prediction, without pid
ds_p3_L = list(ds_p3)
nds_p3 = ds_p3.to_numpy()

# Flattened min/max/slope set with StandardScaler (computed here but not
# passed to the subtask-3 cross validation below).
mnds_p3 = sm.transform(mdata)
#mnds_p3 = mmm.transform(mdata)
#mnds_p3 = rm.transform(mdata)

In [33]:
# Candidate regressors for subtask 3. Only rm0 (Ridge) is added to models3
# below; the other estimators are defined but left disabled.
# NOTE(review): several constructors pass `normalize=`; that keyword is not
# accepted by recent scikit-learn releases - confirm the pinned version.

# Ridge regression with a fixed alpha.
rm0 = sklearn.linear_model.Ridge(alpha=0.112,
                                      fit_intercept=True,
                                      normalize=False, 
                                      copy_X=True,
                                      max_iter=100000,
                                      tol=1e-6,
                                      solver='auto',
                                      random_state=None
                                     )



# Lasso with non-negative coefficients and random coordinate selection.
lm0 = sklearn.linear_model.Lasso(alpha=0.0001,
                                      fit_intercept=True, 
                                      normalize=True, 
                                      precompute=True, 
                                      copy_X=True, 
                                      max_iter=10000000, 
                                      tol=0.0001, 
                                      warm_start=False, 
                                      positive=True, 
                                      random_state=None, 
                                      selection='random')



# Ridge with built-in cross validation over alphas 0.001 .. 0.099.
rcv = sklearn.linear_model.RidgeCV(alphas=[0 + 0.001*a for a in range(1,100)],
                                   fit_intercept=True, 
                                   normalize=False,
                                   scoring=None,
                                   cv=None,
                                   gcv_mode=None,
                                   store_cv_values=False
                                  )

# Lasso with 5-fold cross validation over an automatic alpha grid.
lcv = sklearn.linear_model.LassoCV(eps=0.001,
                                   n_alphas=100,
                                   alphas=None,
                                   fit_intercept=True,
                                   normalize=True, 
                                   precompute='auto',
                                   max_iter=1000, 
                                   tol=0.0001,
                                   copy_X=True,
                                   cv=5, 
                                   verbose=False,
                                   n_jobs=None,
                                   positive=False,
                                   random_state=None, 
                                   selection='cyclic'
                                  )

# Multi-task Lasso with 5-fold cross validation.
mlcv = sklearn.linear_model.MultiTaskLassoCV(eps=0.001,
                                            n_alphas=1000,
                                            alphas=None,
                                            fit_intercept=True,
                                            normalize=True,
                                            max_iter=1000, 
                                            tol=0.0001, 
                                            copy_X=True,
                                            cv=5, 
                                            verbose=False,
                                            n_jobs=None, 
                                            random_state=None, 
                                            selection='cyclic'
                                           )
##Model set used for training
models3 = []
#0 - Sklearn: Ridge regression (alpha=0.112), one regressor per target
models3 = np.append(models3, MultiOutputRegressor(rm0)) 
#models3 = np.append(models3, rcv) 
#models3 = np.append(models3, mlcv)
#models3 = np.append(models3, lm0) 
# NOTE(review): lm1 and mtencv below are not defined in the visible code.
#models3 = np.append(models3, lm1)
#1 Sklearn: Multitask Elastic Net with Cross Validation
#models3 = np.append(models3,mtencv)

In [34]:
# 10-fold cross validation of the subtask-3 models on the min-max-scaled
# clean features (columns Time..pH); scored with the R2-based vitals score.
s3 = fold10_predict(models3, nds_p3, ndsy3, ds_y3_L, 3)

In [35]:
# Show the per-fold scores of subtask 3 (rows: models, columns: folds).
pd.DataFrame(s3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.749344,0.746425,0.740742,0.744761,0.732403,0.767885,0.743762,0.732367,0.749157,0.754903


In [36]:
# Mean score over the 10 folds, one entry per model.
avg_s3 = np.average(s3,axis=1)
pd.DataFrame(avg_s3)

Unnamed: 0,0
0,0.746175


# Testset prediction - in progress

In [37]:
# Pick the model used for each subtask: simply the first entry of every model
# list (placeholder selection, no comparison between candidates is made here).
best_1 = models1[0]#Dummy
best_2 = models2[0]#Dummy
best_3 = models3[0]#Dummy

# Refit on the full training data. Subtasks 1 and 2 use the scaled
# min/max/slope features, subtask 3 the min-max-scaled clean features.
best_1.fit(mnds_p1,ndsy1)
best_2.fit(mnds_p2,ndsy2)
best_3.fit(nds_p3,ndsy3)

# Load the test features twice: as a DataFrame for the clean pipeline and as
# a plain float array for the min/max/slope pipeline.
testset_x = pd.read_csv("test_features.csv")
testset_x_L = list(testset_x)
test_x = testset_x.to_numpy()

test_raw = np.genfromtxt("./test_features.csv",delimiter=",",skip_header=1)

# Apply the same feature construction as on the training data.
test12_x = flatten_min_max_slope(test_raw)
test3_x = clean_set(testset_x,testset_x_L,1,1,True)

# Scale with the scalers fitted on the training data (sm for the
# min/max/slope set, mmcds for the clean set).
test12_x = sm.transform(test12_x)
test3_x = pd.DataFrame(mmcds.transform(test3_x),columns=dataset_x_L)

# Keep only the columns "Time" through "pH", as for the training features.
#ctes1 = test1_x.loc[:,"Time":"pH"] 
#ctes2 = test2_x.loc[:,"Time":"pH"]
ctes3 = test3_x.loc[:,"Time":"pH"]

In [38]:
# Predict the test set with the refitted models. Subtasks 1 and 2 output
# sigmoid-squashed decision values, subtask 3 the regressor's raw predictions.
pred1 = predict_sigmoid(best_1,test12_x)
pred2 = predict_sigmoid(best_2,test12_x)
pred3 = best_3.predict(ctes3)

In [39]:
# Wrap each prediction array in a DataFrame labelled with its subtask's
# label column names.
pred1 = pd.DataFrame(pred1,columns=ds_y1_L)
pred2 = pd.DataFrame(pred2,columns=ds_y2_L)
pred3 = pd.DataFrame(pred3,columns=ds_y3_L)

In [40]:
# Recover one pid per patient by running the mean reduction (method 1) over
# the test features and keeping its first column, then join the pid column
# with the three prediction tables side by side.
pids = time_reduction(testset_x,list(testset_x),12,1)
# NOTE(review): the result of this expression is discarded.
pd.DataFrame(pids)
pids = pd.DataFrame(pids[:,0],columns=['pid'])
pred = pd.concat([pids.astype(int),pred1,pred2, pred3], axis=1)
pred

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.707593,0.654844,0.993288,0.985365,0.991291,0.486329,0.242553,0.337295,0.372765,0.270237,0.342013,14.012866,83.416372,98.748881,82.715085
1,10001,0.329964,0.289032,0.398904,0.398247,0.398920,0.320671,0.325067,0.315605,0.279186,0.277493,0.285905,18.266961,88.308131,94.943706,102.685540
2,10003,0.387979,0.280745,0.343495,0.345677,0.347167,0.368191,0.297728,0.490090,0.285779,0.287705,0.277381,18.828149,81.412713,97.769234,92.562836
3,10004,0.263019,0.286903,0.410673,0.410492,0.405745,0.298992,0.316362,0.299407,0.284264,0.290991,0.280757,16.559621,72.310694,95.809019,88.179003
4,10005,0.339288,0.274910,0.297833,0.294815,0.300466,0.305847,0.286129,0.313274,0.269874,0.231511,0.288193,19.385573,74.566437,95.965975,61.564921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,9989,0.484036,0.282942,0.362074,0.356410,0.354223,0.418644,0.232480,0.363094,0.272366,0.227755,0.330300,19.751896,79.975079,95.726405,103.082205
12660,9991,0.515473,0.329618,0.376097,0.355090,0.359779,0.407493,0.256220,0.250685,0.266227,0.295365,0.318244,18.190472,95.584866,98.851082,74.358144
12661,9992,0.507762,0.289909,0.325219,0.320365,0.323363,0.366714,0.250529,0.517562,0.275168,0.262315,0.286965,18.762833,69.064481,97.343220,84.219772
12662,9994,0.930942,0.569146,0.698526,0.739675,0.789949,0.889366,0.197899,0.791365,0.313319,0.240242,0.421104,15.934689,86.604429,98.528002,96.415236


In [41]:
# Write the combined predictions as a zip-compressed CSV without the index
# column, with floats formatted to 3 decimals.
pred.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')

In [42]:
# Sanity check: the written submission must have the same shape (rows and
# columns) as the provided sample file. Only the shape is compared.
df_submission = pd.read_csv('prediction.zip')
df_sample = pd.read_csv('sample.zip')
df_submission.shape == df_sample.shape

True

In [43]:
# Load and display an earlier submission stored under ./predictions/0.74/
# for a side-by-side look at the new predictions.
last_sol = pd.read_csv('./predictions/0.74/prediction.zip')
last_sol

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.542,0.485,0.596,0.599,0.602,0.436,0.294,0.409,0.308,0.300,0.341,12.008,80.057,98.734,82.152
1,10001,0.261,0.267,0.270,0.270,0.270,0.268,0.267,0.265,0.269,0.268,0.288,18.145,88.704,94.911,102.911
2,10003,0.257,0.270,0.255,0.258,0.261,0.220,0.267,0.305,0.266,0.236,0.279,18.151,83.593,97.850,91.324
3,10004,0.259,0.261,0.248,0.247,0.246,0.267,0.263,0.275,0.263,0.267,0.284,16.794,72.866,95.782,88.420
4,10005,0.230,0.260,0.251,0.251,0.257,0.256,0.253,0.249,0.264,0.217,0.291,20.202,75.970,95.932,61.871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,9989,0.560,0.276,0.274,0.279,0.276,0.306,0.238,0.292,0.269,0.242,0.334,19.832,80.770,95.694,103.481
12660,9991,0.460,0.317,0.393,0.384,0.389,0.377,0.278,0.349,0.272,0.280,0.321,16.405,92.036,98.821,74.250
12661,9992,0.434,0.275,0.259,0.261,0.263,0.287,0.249,0.631,0.266,0.260,0.289,19.432,71.552,97.368,84.167
12662,9994,0.497,0.406,0.524,0.527,0.533,0.511,0.304,0.451,0.311,0.315,0.425,15.256,85.095,98.470,96.872
