# TASK 2

## Libraries - needed

In [28]:
# General libraries
import pandas as pd #Pandas
import numpy as np #Numpy
import sklearn #Sklearn
import math
from numpy.random import seed


#libraries needed for preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#Libraries needed for imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#Libraries needed for models
#subtasks 1 and 2
from sklearn import datasets, linear_model
from sklearn.datasets import make_regression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
#subtask3
from sklearn.linear_model import RidgeCV

#Libraries needed for plotters
from matplotlib import pyplot as plt

#Libraries needed for scoring
import sklearn.metrics as metrics

# Notebook-wide numpy print settings: 5 decimals, no scientific notation,
# and lines up to 300 characters wide so wide rows are not wrapped.
np.set_printoptions(precision=5,suppress=True, linewidth=300)

## Data Manipulation functions

### Auxiliary functions
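1. check_all_nan - checks if a column is all NaN
- zeroize - sets every entry of a vector to zero and returns it
- get_next_val - returns the offset and value of the next non-NaN entry at or after a given index
- get_patient_matrix - returns the matrix containing the data of one patient (12 rows, columns from index 3 onward)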

In [29]:
def check_all_nan(vector):
    """
    Tell whether every entry of a numeric array is NaN.
    Parameters:
    Input vector - Numpy array: values to inspect
    Output - numpy bool: True when all entries are NaN (vacuously True for an empty array)
    """
    # np.isnan is already an element-wise ufunc; wrapping it in np.vectorize
    # only added a Python-level loop and made empty input raise ValueError.
    return np.all(np.isnan(vector))

In [30]:
def zeroize(vector):
    """
    Overwrite every entry of an array with zero, in place.
    Parameters:
    Input vector - Numpy array: modified in place
    Output vector - the same array object, now filled with zeros
    """
    # A single slice assignment replaces the per-element Python loop and also
    # works for arrays with more than one dimension.
    vector[:] = 0
    return vector

In [31]:
def get_next_val(vector,index):
    """
    Find the first non-NaN entry of vector at or after position index.
    Parameters:
    Input vector - Numpy array: values to scan
    Input index - int: position where the scan starts
    Output - tuple (skipped, value): number of NaN entries stepped over and the
             value found; value is NaN when no valid entry remains
    """
    remaining = vector.size - index
    offset = 0
    while offset < remaining:
        candidate = vector[index + offset]
        if not np.isnan(candidate):
            return (offset, candidate)
        offset += 1
    # Ran off the end without meeting a valid entry.
    return (offset, np.nan)

In [32]:
def get_patient_matrix(raw_data, i, rows_per_patient=12, first_feature_col=3):
    """
    Return the block of feature rows that belongs to the i-th patient.
    Parameters:
    Input raw_data - 2-D Numpy array with the rows of each patient stored consecutively
    Input i - int: zero-based patient position
    Input rows_per_patient - int: number of consecutive rows per patient (default 12)
    Input first_feature_col - int: index of the first column to keep (default 3)
    Output - Numpy view on raw_data with shape (rows_per_patient, width - first_feature_col)
    """
    start = rows_per_patient * i
    # Basic slicing returns a view, so writes to the result reach raw_data;
    # flatten_min_max_slope relies on this when it interpolates in place.
    return raw_data[start:start + rows_per_patient, first_feature_col:]

### Imputation functions
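1. interp - fills the NaNs of a vector by linear interpolation (NaNs at the edges take the nearest valid value; an all-NaN vector becomes zeros)
- nan_imputer - imputes data using sklearn IterativeImputer function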


In [33]:
def interp(vector):
    """
    Fill the NaN entries of a 1-D vector in place by linear interpolation.

    Gaps between two valid values are filled linearly, leading NaNs take the
    first valid value, trailing NaNs take the last valid value, and a vector
    made only of NaNs is set to zero.
    Parameters:
    Input vector - 1-D Numpy float array, modified in place
    Output vector - the same array object, without NaNs
    """
    missing = np.isnan(vector)
    if missing.all():
        # Nothing to interpolate from: fall back to zeros.
        vector[:] = 0
        return vector
    if missing.any():
        positions = np.arange(vector.size)
        known = ~missing
        # np.interp interpolates linearly between known points and clamps to
        # the first/last known value outside their range, which matches the
        # back-fill / forward-fill rule used at the edges.
        vector[missing] = np.interp(positions[missing],
                                    positions[known],
                                    vector[known])
    return vector

In [34]:
def nan_imputer(nds,method):
    """
    Given a dataset removes NaNs using the selected imputation method
    Parameters:
    Input nds - Numpy array: dataset
    Input method - method of imputation to use
                   (1: sklearn IterativeImputer, fills NaNs considering other features)
    Output nds_xnan - Numpy array: dataset without NaNs
    Raises ValueError when method is not a supported value
    """
    # Fail with a clear message instead of an UnboundLocalError at the return.
    if method != 1:
        raise ValueError(
            f"Unsupported imputation method: {method!r} (only 1 is implemented)"
        )
    imp = IterativeImputer(max_iter=100, random_state=0)
    imp.fit(nds)
    nds_xnan = imp.transform(nds)
    return nds_xnan

### Reduction functions
1. time_reduction - reduces dataset by taking mean of columns per patient

In [35]:
def time_reduction(nds,labels, time, method):
    """
    Given a dataset containing several rows per patient outputs one row per patient
    Parameters:
    Input nds - numpy dataset (or DataFrame) that contains a 'pid' column
    Input labels - list labels of dataset
    Input time - time in hours to compress data (kept for compatibility; the
                 grouping is done on 'pid', so this value is not used)
    Input method - method of reduction to use
                   (1: mean of every column per patient)
    Output nds_reduced - Numpy array: dataset compressed, patients in order of first appearance
    Raises ValueError when method is not a supported value
    """
    # Fail with a clear message instead of an UnboundLocalError at the return.
    if method != 1:
        raise ValueError(
            f"Unsupported reduction method: {method!r} (only 1 is implemented)"
        )
    frame = pd.DataFrame(nds,columns=labels)
    # sort=False keeps the patients in file order; as_index=False keeps 'pid'
    # as a regular column of the result.
    nds_reduced = frame.groupby('pid',sort=False,as_index=False).mean()
    return nds_reduced.to_numpy()

### Combining functions
1. clean_set - combines time_reduction and nan_imputer functions
- flatten_min_max_slope - applies the interp function to each patient's 12 rows and reduces every feature to its min, max and (max - min)/12

In [36]:
def clean_set(nds,headers,imp_method,time_method,sequence):
    """
    Reduce a dataset to one row per patient and impute its NaNs
    Parameters:
    Input nds - numpy dataset
    Input headers - list headers of dataset
    Input imp_method - method of imputation passed to nan_imputer
    Input time_method - method of reduction passed to time_reduction
    Input sequence - order of the two steps: truthy reduces first and then
                     imputes, falsy imputes first and then reduces
    Output  - DataFrame with columns headers: dataset compressed and without NaNs
    """
    if sequence:
        reduced = time_reduction(nds, headers, 12, time_method)
        cleaned = nan_imputer(reduced, imp_method)
    else:
        imputed = nan_imputer(nds, imp_method)
        cleaned = time_reduction(imputed, headers, 12, time_method)
    return pd.DataFrame(cleaned, columns=headers)

In [37]:
def flatten_min_max_slope(raw_data):
    """
    Flatten each patient's 12 rows into one feature row of min, max and range/12
    Parameters:
    Input raw_data - 2-D Numpy array with 12 consecutive rows per patient; the
                     columns from index 3 onward are the measured features.
                     NaNs in those columns are interpolated in place, so
                     raw_data is modified by this call.
    Output res - Numpy array with one row per patient: column 0 is the value of
                 raw_data column 2 on the patient's first row, followed by
                 (min, max, (max - min)/12) for every feature column
    """
    n, w = raw_data.shape
    c = w - 3
    ndiv = int(n / 12)
    res = np.zeros((ndiv, 1 + c * 3))
    for i in range(ndiv):
        block = get_patient_matrix(raw_data, i)
        for j in range(c):
            block[:, j] = interp(block[:, j])
        res[i, 0] = raw_data[i * 12, 2]
        # Column-wise extrema of the NaN-free block, interleaved as
        # (min, max, slope) triples starting at output column 1.
        lows = block.min(axis=0)
        highs = block.max(axis=0)
        res[i, 1::3] = lows
        res[i, 2::3] = highs
        res[i, 3::3] = (highs - lows) / 12
    return res

# Model functions
1. sigmoid - applies sigmoid function to input
- map_sigmoid - applies sigmoid to dataset
- predict_sigmoid - maps the decision function of the model to sigmoid

In [38]:
def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + exp(-x)) of a scalar.
    Parameters:
    Input x - float: value to squash
    Output - float in [0, 1]
    """
    # Only ever exponentiate a non-positive number: math.exp(-x) raises
    # OverflowError for strongly negative x.
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    z = math.exp(x)
    return z / (1 + z)

In [39]:
def map_sigmoid(X):
    """
    Apply the logistic sigmoid to every entry of an array, in place.
    Parameters:
    Input X - Numpy array: values to squash, overwritten with the result
    Output X - the same array object, entries mapped into [0, 1]
    """
    # exp is only evaluated on non-positive numbers, so it cannot overflow;
    # the two branches are the same function written for x >= 0 and x < 0.
    decay = np.exp(-np.abs(X))
    X[...] = np.where(X >= 0, 1 / (1 + decay), decay / (1 + decay))
    return X

In [40]:
def predict_sigmoid(clf,data):
    """
    Score data with a fitted classifier and squash the scores with a sigmoid.
    Parameters:
    Input clf - fitted model exposing decision_function
    Input data - samples to score
    Output - array of decision_function values passed through map_sigmoid
    """
    scores = clf.decision_function(data)
    return map_sigmoid(scores)

# Subtasks - Setup
1. Data for subtask 1 and 2: cleaned with flatten_min_max_slope function and then scaled with sklearn: StandardScaler
2. Data for subtask 3: cleaned with clean_set function and then scaled with sklearn: MinMaxScaler

In [41]:
# Fix numpy's global random seed before any data preparation
seed(1)
## Setup datasets and scalers for all subtasks

# Load training labels and features from the working directory
dataset_y = pd.read_csv("train_labels.csv")
dataset_x = pd.read_csv("train_features.csv")

# List that contains the column headers of the feature set
dataset_x_L = list(dataset_x)

# Subtask 3 features: one row per patient. Arguments are imp_method=1,
# time_method=1, sequence=True (reduce per patient first, then impute).
cds = clean_set(dataset_x,dataset_x_L,1,1,True)

# Subtasks 1 and 2 features: the same CSV read again as a plain float array and
# flattened to min / max / slope per feature for every patient.
# NOTE: flatten_min_max_slope interpolates the NaNs of raw_data in place.
raw_data = np.genfromtxt("./train_features.csv",delimiter=",",skip_header=1)
mdata = flatten_min_max_slope(raw_data)

# Prepare all scalers; they are fitted on training data only and reused
# unchanged on the test set further down.
mmcds = MinMaxScaler()
mmcds.fit(cds)

sm = StandardScaler()
sm.fit(mdata)

# Training Features
# Subtasks 1 and 2: standardized min/max/slope features
mnds_p12 = sm.transform(mdata)
# Subtask 3: min-max scaled per-patient features
sds_p3 = pd.DataFrame(mmcds.transform(cds),columns=dataset_x_L)
ds_p3 = sds_p3.loc[:,"Time":"pH"] # label slice "Time".."pH"; per the original note this leaves out pid
ds_p3_L = list(ds_p3) # headers of the reduced feature set
nds_p3 = ds_p3.to_numpy() # to numpy

# Training Labels
# Subtask 1
dataset_y1 = dataset_y.loc[:,"LABEL_BaseExcess":"LABEL_EtCO2"] # labels to be predicted in [0,1] range
ds_y1_L = list(dataset_y1) # headers of labels
ndsy1 = dataset_y1.to_numpy() # to numpy

# Subtask 2
dataset_y2 = dataset_y.loc[:,"LABEL_Sepsis"] # label to be predicted in [0,1] range
ds_y2_L = ["LABEL_Sepsis"] # headers of labels
ndsy2 = dataset_y2.to_numpy() # to numpy

# Subtask 3
dataset_y3 = dataset_y.loc[:,"LABEL_RRate":"LABEL_Heartrate"] # labels to be predicted (regression targets)
ds_y3_L = list(dataset_y3) # headers of labels
ndsy3 = dataset_y3.to_numpy() # to numpy

# Subtasks Models
1. Model for subtask 1: sklearn: OneVsRestClassifier using sklearn: LinearSVC model
2. Model for subtask 2: sklearn: LinearSVC
3. Model for subtask 3: sklearn: Ridge

In [42]:
# Model set used for training
# Subtask 1: L1-penalized linear SVM, wrapped below so that one classifier is
# trained per label column.
lsvc_s1 = sklearn.svm.LinearSVC(random_state=0,
                                penalty='l1',
                                loss = 'squared_hinge',
                                C=0.019,
                                dual=False, 
                                tol=0.001,
                                max_iter=100000,
                                fit_intercept=True
                               )
# Model for subtask 1: Sklearn: OneVsRestClassifier using Linear SVC
models1 = OneVsRestClassifier(lsvc_s1)

# Subtask 2: L2-penalized linear SVM on the single sepsis label
lsvc_s2 = sklearn.svm.LinearSVC(random_state=0,
                                penalty='l2',
                                loss = 'squared_hinge',
                                C=0.0008,
                                dual=False, 
                                tol=0.001,
                                max_iter=100000,
                                fit_intercept=True
                               )

# Model for subtask2: Sklearn: Linear SVC
models2 = lsvc_s2

# Model for subtask3: Sklearn: Ridge
# The former `normalize=False` argument is omitted: it was the default value
# and the parameter was removed from Ridge in scikit-learn 1.2, where passing
# it raises TypeError.
rm_s3 = sklearn.linear_model.Ridge(alpha=0.5,
                                      fit_intercept=True,
                                      copy_X=True,
                                      max_iter=100000,
                                      tol=1e-6,
                                      solver='auto',
                                      random_state=None
                                     )
# One independent Ridge regressor per target column
models3 = MultiOutputRegressor(rm_s3)

In [43]:
# Fitting models on dataset
# Subtasks 1 and 2 train on the StandardScaler-transformed min/max/slope features
models1.fit(mnds_p12,ndsy1)
models2.fit(mnds_p12,ndsy2)
# Subtask 3 trains on the MinMaxScaler-transformed per-patient features
models3.fit(nds_p3,ndsy3)

MultiOutputRegressor(estimator=Ridge(alpha=0.5, copy_X=True, fit_intercept=True,
                                     max_iter=100000, normalize=False,
                                     random_state=None, solver='auto',
                                     tol=1e-06),
                     n_jobs=None)

# Testset prediction

In [44]:
# Extract the dataset to predict
testset_x = pd.read_csv("test_features.csv")
testset_x_L = list(testset_x)
test_x = testset_x.to_numpy()

# Same file read again as a plain float array for flatten_min_max_slope
test_raw = np.genfromtxt("./test_features.csv",delimiter=",",skip_header=1)

# Cleaning data: remove NaNs and reduce the time axis to one row per patient,
# using the same two pipelines as for the training set
test12_x = flatten_min_max_slope(test_raw)
test3_x = clean_set(testset_x,testset_x_L,1,1,True)

# Scale data with the scalers fitted on the training set
test12_x = sm.transform(test12_x) #Scaled with StandardScaler
test3_x = pd.DataFrame(mmcds.transform(test3_x),columns=dataset_x_L) #Scaled with MinMaxScaler; labelled with the training headers

# Reduced dataset for prediction: label slice "Time".."pH" (without pid per the original note)
ctes3 = test3_x.loc[:,"Time":"pH"]

In [45]:
# Prediction using the fitted model of each subtask
# Subtasks 1 and 2: decision_function scores mapped through a sigmoid
pred1 = predict_sigmoid(models1,test12_x)
pred2 = predict_sigmoid(models2,test12_x)
# Subtask 3: direct regression output
# NOTE(review): models3 was fitted on a numpy array but predicts on a DataFrame here;
# recent scikit-learn versions may warn about feature names - confirm.
pred3 = models3.predict(ctes3)

In [46]:
# Conversion to DataFrames, labelled with the corresponding training label headers
pred1 = pd.DataFrame(pred1,columns=ds_y1_L)
pred2 = pd.DataFrame(pred2,columns=ds_y2_L)
pred3 = pd.DataFrame(pred3,columns=ds_y3_L)

In [47]:
# Adding the pids and assemble final prediction
# One pid per patient, in order of first appearance in the test file. This is
# the order produced by the sort=False group-by in time_reduction, so the
# per-patient mean does not need to be recomputed just to recover the ids.
pids = pd.DataFrame(testset_x['pid'].unique(),columns=['pid'])
# Column-wise concatenation; all frames share the default 0..n-1 index.
pred = pd.concat([pids.astype(int),pred1,pred2, pred3], axis=1)
pred

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.707593,0.654844,0.993288,0.985365,0.991291,0.486329,0.242553,0.337295,0.372765,0.270237,0.344756,14.091650,83.468762,98.663943,82.904071
1,10001,0.329964,0.289032,0.398904,0.398247,0.398920,0.320671,0.325067,0.315605,0.279186,0.277493,0.289146,18.309099,88.286604,94.984419,102.687131
2,10003,0.387979,0.280745,0.343495,0.345677,0.347167,0.368191,0.297728,0.490090,0.285779,0.287705,0.281241,18.832443,81.398159,97.753127,92.538797
3,10004,0.263019,0.286903,0.410673,0.410492,0.405745,0.298992,0.316362,0.299407,0.284264,0.290991,0.284447,16.587392,72.373648,95.836593,88.134649
4,10005,0.339288,0.274910,0.297833,0.294815,0.300466,0.305847,0.286129,0.313274,0.269874,0.231511,0.292123,19.357464,74.696847,96.005844,61.534041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,9989,0.484036,0.282942,0.362074,0.356410,0.354223,0.418644,0.232480,0.363094,0.272366,0.227755,0.333432,19.758554,80.031043,95.762265,103.046655
12660,9991,0.515473,0.329618,0.376097,0.355090,0.359779,0.407493,0.256220,0.250685,0.266227,0.295365,0.321581,18.230689,95.374560,98.778412,74.528911
12661,9992,0.507762,0.289909,0.325219,0.320365,0.323363,0.366714,0.250529,0.517562,0.275168,0.262315,0.291101,18.738353,69.116506,97.348286,84.155034
12662,9994,0.930942,0.569146,0.698526,0.739675,0.789949,0.889366,0.197899,0.791365,0.313319,0.240242,0.422001,15.993554,86.583641,98.477000,96.417445


In [48]:
# Write the assembled predictions to a zip-compressed CSV:
# no index column, floats formatted with 3 decimals
pred.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')

In [49]:
# Check format: read the written file back and compare its shape with the
# provided sample file; the bare expression displays True when they match
df_submission = pd.read_csv('prediction.zip')
df_sample = pd.read_csv('sample.zip')
df_submission.shape == df_sample.shape

True