# TASK 2
Current considerations:
1. Current score ≈ 0.773 (hard baseline: 0.772478169274)
- Experiment with different parametrizations and models (currently only one per subtask)
- Experiment with preprocessing (maybe I'll try to use PCA and see if it can make a difference)

## Libraries - needed

In [1]:
# Needed libraries
import pandas as pd #Pandas
import numpy as np #Numpy
import sklearn #Sklearn
from sklearn import datasets, linear_model
from sklearn.datasets import make_regression

#libraries needed for preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import RobustScaler

#Libraries needed for imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

#Libraries needed for models
#subtasks 1 and 2
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
#subtask3
from sklearn.linear_model import RidgeCV

#Libraries needed for plotters
from matplotlib import pyplot as plt

#Libraries needed for scoring
import sklearn.metrics as metrics

## MATT Libraries
import math
from sklearn import svm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn import linear_model
import time
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

np.set_printoptions(precision=5,suppress=True, linewidth=300)

## Data Manipulation functions - needed

### Auxiliary functions
1. check_all_nan 
- zeroize
- scale
- get_next_val
- patient_pca
- get_patient_matrix

In [2]:
def check_all_nan(vector):
    """
    Tell whether every element of a numeric array is NaN
    Parameters:
    Input vector - numpy array (or array-like) of numbers
    Output - True if all elements are NaN (also True for an empty input), False otherwise
    """
    # np.isnan is already element-wise; wrapping it in np.vectorize only adds a
    # Python-level loop and makes the call fail on zero-length input.
    return np.all(np.isnan(vector))

In [3]:
def zeroize(vector):
    """
    Overwrite every element of a numpy array with 0, in place
    Parameters:
    Input vector - numpy array of any shape (it is modified)
    Output - the same array object, now filled with zeros
    """
    # fill() writes through to the caller's buffer, so views (e.g. a column
    # slice of a larger matrix) are zeroed in the parent array as well.
    vector.fill(0)
    return vector

In [4]:
def scale(data):
    """
    Standardize a 2-D array with a StandardScaler fitted on that same array
    Parameters:
    Input data - 2-D numpy array (its shape is unpacked into two values)
    Output - the array returned by StandardScaler(copy=False).transform(data)
    """
    # The two values are not used; the unpacking only succeeds for 2-D input.
    _n_rows, _n_cols = data.shape
    # copy=False asks sklearn not to copy, so `data` itself may be overwritten
    # (presumably the case for float arrays - verify before relying on it).
    standardizer = StandardScaler(copy=False)
    return standardizer.fit(data).transform(data)

In [5]:
def get_next_val(vector,index):
    """
    Find the first non-NaN entry of vector at or after position index
    Parameters:
    Input vector - 1-D numpy array
    Input index - position where the search starts
    Output (count, value) - number of NaN entries skipped and the value found;
                            (number of entries scanned, NaN) when nothing is found
    """
    remaining = vector.size - index
    for offset in range(remaining):
        candidate = vector[index + offset]
        if not np.isnan(candidate):
            return (offset, candidate)
    # Nothing found: every scanned entry was NaN (or there was nothing to scan).
    return (max(remaining, 0), np.nan)

In [6]:
def patient_pca(data,c):
    """
    Fit a PCA with c components on data and return the first entry of its components
    Parameters:
    Input data - 2-D numpy array passed to PCA.fit
    Input c - number of principal components
    Output - numpy array of shape (1,) holding the first element of pca.components_
    """
    fitted = PCA(n_components=c)
    fitted.fit(data)
    components = fitted.components_
    # Reshaped to a single column, so indexing row 0 yields exactly one value.
    # NOTE(review): possibly the whole flattened component vector was intended;
    # confirm against flatten_pca before changing the returned shape.
    as_column = components.reshape(components.size, 1)
    return as_column[0]

In [7]:
def get_patient_matrix(raw_data,i,rows_per_patient=12,first_feature_col=3):
    """
    Select the block of consecutive rows belonging to the i-th patient
    Parameters:
    Input raw_data - 2-D numpy array in which each patient occupies
                     rows_per_patient consecutive rows
    Input i - zero-based patient position
    Input rows_per_patient - rows per patient (default 12, the value previously hard-coded)
    Input first_feature_col - first column kept (default 3, i.e. the first three columns are dropped)
    Output - view on raw_data with the patient's rows and the kept columns
    """
    start = rows_per_patient * i
    # Basic slicing returns a view, so in-place edits of the result also
    # modify raw_data.
    return raw_data[start:start + rows_per_patient, first_feature_col:]

### Imputation functions
1. interp
- nan_imputer - imputes missing values using sklearn's IterativeImputer


In [8]:
def interp(vector):
    """
    Fill the NaN entries of a 1-D array in place by linear interpolation
    Parameters:
    Input vector - 1-D numpy float array (it is modified)
    Output - the same array object without NaNs

    Rules:
    - all entries NaN (or empty vector): every entry becomes 0
    - NaNs before the first valid value take that first value
    - NaNs after the last valid value take that last value
    - NaNs between two valid values lie on the straight line joining them,
      using the element position as x coordinate
    """
    missing = np.isnan(vector)
    if missing.all():
        vector.fill(0)
        return vector
    if missing.any():
        positions = np.arange(vector.size)
        # np.interp clamps to the first/last known value outside the known
        # range, which gives the edge-fill rules described above. Compared
        # with accumulating equal steps one by one, results can differ only
        # by floating-point rounding.
        vector[missing] = np.interp(positions[missing],
                                    positions[~missing],
                                    vector[~missing])
    return vector

In [9]:
def nan_imputer(nds,method):
    """
    Given a dataset, replaces its NaNs with imputed values
    Parameters:
    Input nds - Numpy array: dataset
    Input method - method of imputation to use; only 1 is implemented
                   (sklearn IterativeImputer with max_iter=10, random_state=0)
    Output nds_xnan - Numpy array: dataset returned by the imputer's transform
    Raises ValueError - if method is not a supported value
    """
    if method != 1:
        # Previously an unknown method fell through to an unbound local variable.
        raise ValueError("Unsupported imputation method: %r (only 1 is implemented)" % (method,))
    # Sklearn: IterativeImputer, removes NaN considering other features
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(nds)
    nds_xnan = imp.transform(nds)
    return nds_xnan

### Reduction functions
1. time_reduction
- flatten?

In [10]:
def _evolution_score(column):
    """Return the time-weighted sum of changes of one column, or NaN if the column is all NaN."""
    score = 0
    previous = 0
    seen_value = False
    for hour, value in enumerate(column, start=1):
        if not np.isnan(value):
            # A change observed at a later row weighs more (factor hour/10).
            score = score + ((value - previous) * hour / 10)
            previous = value
            seen_value = True
    return score if seen_value else np.nan


def time_reduction(nds,labels, time, method):
    """
    Given a dataset containing data on consecutive hours, outputs one row per patient
    Parameters:
    Input nds - numpy dataset (or DataFrame) whose columns are described by labels
    Input labels - list labels of dataset; must contain 'pid' for method 1
    Input time - number of consecutive rows per patient (used by method 2 only);
                 note that inside this function the name hides the `time` module
    Input method - method of reduction to use:
                   1 - mean of every column per 'pid' group, in order of first appearance
                   2 - evolution score of every column (including 'pid') per block of rows
    Output nds_reduced - numpy array with one row per patient
    Raises ValueError - if method is neither 1 nor 2
    """
    nds = pd.DataFrame(nds,columns=labels)

    if method==1:
        # Reduce by taking mean of columns for each patient
        nds_reduced = nds.groupby('pid',sort=False,as_index=False).mean()
    elif method==2:
        numpat = int(len(nds) / time) #number of patients
        # Split the raw values into one chunk of rows per patient.
        chunks = np.array_split(nds.to_numpy(),numpat,axis=0)
        # Rows are collected in a list so a single patient still yields a 2-D
        # table and no array is re-allocated on every iteration.
        rows = [[_evolution_score(chunk[:,col]) for col in range(chunk.shape[1])]
                for chunk in chunks]
        nds_reduced = pd.DataFrame(rows,columns=labels)
    else:
        raise ValueError("Unsupported reduction method: %r (expected 1 or 2)" % (method,))
    return nds_reduced.to_numpy()

In [11]:
def flatten_pca(raw_data,c):
    """
    Build one row per patient from the output of patient_pca
    Parameters:
    Input raw_data - 2-D numpy array with 12 consecutive rows per patient
    Input c - number of PCA components requested from patient_pca
    Output res - numpy array of shape (n // 12, c * (w - 2)); column 0 holds the
                 third column of the patient's first row divided by 100, the
                 remaining columns hold what patient_pca returns
    """
    n,w = raw_data.shape
    # Integer division: a float shape / range bound raises TypeError in Python 3.
    n_patients = n // 12
    res = np.zeros((n_patients,c*(w-2)))
    for i in range(n_patients):
        temp = get_patient_matrix(raw_data,i)
        # Fill the NaNs of every feature column before fitting the PCA.
        for j in range (w-3):
            temp[:,j] = interp(temp[:,j])

        res[i][0] = raw_data[i * 12][2] /100
        # NOTE(review): the slice is wider than the row for c > 1 and is
        # silently clipped; the right-hand side is broadcast over it.
        res[i][1:c*w-2] = patient_pca(temp,c)
    return res

### Combining functions
1. clean_set
- flatten_min_max_slope

In [12]:
def clean_set(nds,labels,imp_method,time_method,sequence):
    """
    Remove NaNs and compress the 12 rows of each patient into one row
    Parameters:
    Input nds - dataset to clean
    Input labels - list labels of dataset
    Input imp_method - imputation method forwarded to nan_imputer
    Input time_method - reduction method forwarded to time_reduction
    Input sequence - if truthy, reduce first and impute afterwards;
                     otherwise impute first and reduce afterwards
    Output - pandas DataFrame with columns labels holding the cleaned dataset
    """
    if sequence:
        reduced = time_reduction(nds, labels, 12, time_method)
        cleaned = nan_imputer(reduced, imp_method)
    else:
        imputed = nan_imputer(nds, imp_method)
        cleaned = time_reduction(imputed, labels, 12, time_method)
    # Both orders end with the same conversion back to a labelled DataFrame.
    return pd.DataFrame(cleaned, columns=labels)

In [13]:
def flatten_min_max_slope(raw_data):
    """
    Summarize the 12 rows of every patient by min, max and slope of each feature
    Parameters:
    Input raw_data - 2-D numpy array with 12 consecutive rows per patient; the
                     first three columns are not treated as features.
                     NaNs of the feature columns are filled through interp on
                     views of raw_data, so raw_data itself is modified
    Output res - numpy array of shape (n // 12, 1 + 3 * (w - 3)):
                 column 0      - third column of the patient's first row
                 column 3*j+1  - minimum of feature j
                 column 3*j+2  - maximum of feature j
                 column 3*j+3  - (maximum - minimum) / 12
    """
    n,w =  raw_data.shape
    c = w - 3
    ndiv = int(n/12)
    res = np.zeros((ndiv,1 + c * 3))
    for i in range(ndiv):
        patient = get_patient_matrix(raw_data,i)
        for j in range (c):
            patient[:,j] = interp(patient[:,j])
        res[i][0] = raw_data[i * 12][2]
        # Column-wise extrema of the filled block; names avoid shadowing the
        # builtins min/max.
        lows = patient.min(axis=0)
        highs = patient.max(axis=0)
        # The three statistics of a feature are stored next to each other.
        res[i, 1::3] = lows
        res[i, 2::3] = highs
        res[i, 3::3] = (highs - lows)/12
    return res

## Model choosing functions

In [14]:
def data_fold_10(nds, f_nr, task):
    """
    Given a dataset, outputs two subsets: test set and training set. The test set is the f_nr-th of
    10 consecutive partitions of the dataset, the training set is the remainder of the dataset
    Parameters:
    Input nds - numpy dataset to partition
    Input f_nr - zero-based index (0-9) of the partition used as test set
    Input task - 2: the remaining partitions are joined with np.hstack (used for 1-D labels);
                 anything else: they are joined with np.vstack
    Output (testset, trainingset) - tuple containing the test set and training set
    """
    dss = np.array_split(nds,10,axis=0) #dataset split
    testset = dss[f_nr] #test set
    held_out = f_nr % len(dss) #normalizes a negative index accepted by the line above
    # Keep the other partitions as a plain list: np.array_split yields parts of
    # unequal length when the dataset size is not a multiple of 10, and such a
    # ragged list cannot be turned into one array (as np.delete would do).
    rest = [part for k, part in enumerate(dss) if k != held_out]
    if task==2:
        trainingset = np.hstack(rest) #training set
    else:
        trainingset = np.vstack(rest) #training set
    return (testset, trainingset)

In [15]:
def fold10_predict(models, ndsx, ndsy, ndsy_L, task):
    """
    Run a 10-fold cross validation of every model for one of the three subtasks
    Parameters:
    Input models - sequence of models offering fit and predict / decision_function
    Input ndsx - training features
    Input ndsy - training labels
    Input ndsy_L - training labels headers
    Input task - subtask to fold (1, 2 or 3)
    Output nscores - numpy array of shape (number of models, 10) with the score of each fold
    """
    n_models = len(models)
    nscores = np.zeros((n_models,10))
    # For subtask 2 only the labels are split with task 2; the features are
    # split with task - 1.
    feature_task = task - 1 if task == 2 else task
    for fold in range(10):
        (tes_x, trs_x) = data_fold_10(ndsx, fold, feature_task)
        (tes_y, trs_y) = data_fold_10(ndsy, fold, task)

        for pos, model in enumerate(models):
            model.fit(trs_x,trs_y)
            if task == 3:
                # Subtask 3 is scored on the plain predictions.
                tes_yp = model.predict(tes_x)
            else:
                # Subtasks 1 and 2 are scored on sigmoid-squashed decision values.
                tes_yp = predict_sigmoid(model,tes_x)
            # Transform into DataFrame for scoring
            df_y = pd.DataFrame(tes_y, columns=ndsy_L)
            df_yp = pd.DataFrame(tes_yp, columns=ndsy_L)
            nscores[pos,fold] = scores(df_y,df_yp, task)
    return nscores

In [16]:
def scores(tes_y, tes_yp, task):
    """
    Compute the score of one subtask
    Parameters:
    Input tes_y - DataFrame with the true labels
    Input tes_yp - DataFrame with the predicted labels
    Input task - subtask: 3 uses R^2 on the vitals, 2 uses ROC AUC on
                 LABEL_Sepsis, any other value uses the mean ROC AUC of the tests
    Output score - score of the subtask
    """
    if task == 3:
        VITALS = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
        per_vital = []
        for entry in VITALS:
            r2 = metrics.r2_score(tes_y[entry], tes_yp[entry])
            # Negative R^2 is clipped to 0, then mapped into [0.5, 1].
            per_vital.append(0.5 + 0.5 * np.maximum(0, r2))
        return np.mean(per_vital)

    if task == 2:
        return metrics.roc_auc_score(tes_y['LABEL_Sepsis'], tes_yp['LABEL_Sepsis'])

    TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
             'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
             'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
    per_test = [metrics.roc_auc_score(tes_y[entry], tes_yp[entry]) for entry in TESTS]
    return np.mean(per_test)

In [17]:
def sigmoid(x):
    """
    Logistic function 1 / (1 + e^-x) of a scalar
    Parameters:
    Input x - real number
    Output - value in [0, 1]
    """
    if x >= 0:
        return 1/(1 + math.exp(-x))
    # For negative x, exp(-x) can exceed the float range and raise
    # OverflowError; the algebraically equal form below only evaluates exp of
    # a negative number.
    e = math.exp(x)
    return e/(1 + e)

In [18]:
def map_sigmoid(X):
    """
    Apply the logistic function to every element of a numpy array, in place
    Parameters:
    Input X - numpy array of any shape (it is modified)
    Output - the same array object holding 1 / (1 + e^-x) for every element
    """
    # exp is only evaluated on non-positive numbers, so large magnitudes
    # cannot overflow: for x >= 0 use 1/(1+e^-x), for x < 0 the equal e^x/(1+e^x).
    decay = np.exp(-np.abs(X))
    X[...] = np.where(X >= 0, 1/(1 + decay), decay/(1 + decay))
    return X

In [19]:
def predict_sigmoid(clf,data):
    """
    Squash the decision values of a fitted classifier with the sigmoid
    Parameters:
    Input clf - fitted model offering decision_function
    Input data - samples to evaluate
    Output - array returned by map_sigmoid for clf.decision_function(data)
    """
    decision_values = clf.decision_function(data)
    return map_sigmoid(decision_values)

# Subtasks - Setup

In [20]:
## Setup for all subtasks
# Loads the training data twice: as pandas DataFrames (for the label-based
# pipeline) and as a raw numpy matrix (for flatten_min_max_slope).

#Extracting training labels and features
dataset_y = pd.read_csv("train_labels.csv")
dataset_x = pd.read_csv("train_features.csv")
raw_data = np.genfromtxt("./train_features.csv",delimiter=",",skip_header=1)


#lists that contain header of dataset
dataset_x_L = list(dataset_x) 

#Standard Scaler of dataset 
# NOTE(review): the scaler is fitted on the whole frame, so every column of
# dataset_x is standardized - including the 'pid' column that time_reduction
# later groups by. Confirm this is intended.
scaler = StandardScaler()
#scaler = MinMaxScaler()
#scaler = RobustScaler()
scaler.fit(dataset_x)
scaled_data = scaler.transform(dataset_x)
scaled_data = pd.DataFrame(scaled_data,columns=dataset_x_L)

# Second feature set: per-patient min / max / slope summary, standardized.
sfmms_data = scale(flatten_min_max_slope(raw_data))

In [21]:
#subtask 1
#Training labels
dataset_y1 = dataset_y.loc[:,"LABEL_BaseExcess":"LABEL_EtCO2"] #Labels to be predicted in [0,1] range
ds_y1_L = list(dataset_y1) #headers of labels
ndsy1 = dataset_y1.to_numpy() #to numpy

#Training Features
#Cleaning dataset in order to remove NaNs and reduce dimensionality
# Arguments: imputation method 1, reduction method 1 (mean per pid),
# sequence=True (reduce first, then impute).
cs1 = clean_set(scaled_data,dataset_x_L,1,1,True)
##Division for the prediction, probabilities divided from the real values
ds_p1 = cs1.loc[:,"Time":"pH"] #reduced dataset for prediction, without pid

#labels of datasets
ds_p1_L = list(ds_p1)

#transform into numpy
nds_p1 = ds_p1.to_numpy()
# Alternative feature set (min / max / slope); this is the one passed to
# fold10_predict for subtask 1.
mnds_p1 = sfmms_data

In [22]:
#subtask 2
#Training labels
dataset_y2 = dataset_y.loc[:,"LABEL_Sepsis"] #Labels to be predicted in [0,1] range
ds_y2_L = ["LABEL_Sepsis"] #headers of labels
ndsy2 = dataset_y2.to_numpy() #to numpy (1-D, a single label column)

#Training Features
# Same cleaning call as for subtask 1 (imputation 1, reduction 1, reduce first).
cs2 = clean_set(scaled_data,dataset_x_L,1,1,True)
ds_p2 = cs2.loc[:,"Time":"pH"] #reduced dataset for prediction, without pid
ds_p2_L = list(ds_p2)
nds_p2 = ds_p2.to_numpy()
# Min / max / slope feature set; this is the one passed to fold10_predict for subtask 2.
mnds_p2 = sfmms_data

In [23]:
#subtask 3
#Training labels
dataset_y3 = dataset_y.loc[:,"LABEL_RRate":"LABEL_Heartrate"] #Vital-sign columns to be predicted (scored with R^2 in scores)
ds_y3_L = list(dataset_y3) #headers of labels
ndsy3 = dataset_y3.to_numpy() #to numpy

#Training Features
# Same cleaning call as for subtasks 1 and 2 (imputation 1, reduction 1, reduce first).
cs3 = clean_set(scaled_data,dataset_x_L,1,1,True)
ds_p3 = cs3.loc[:,"Time":"pH"] #reduced dataset for prediction, without pid
ds_p3_L = list(ds_p3)
nds_p3 = ds_p3.to_numpy() # feature set passed to fold10_predict for subtask 3
mnds_p3 = sfmms_data # min / max / slope feature set (not used by the subtask 3 cells shown here)

# Subtask1 training models

In [24]:
#Model set used for training
# svcms is only referenced by the commented-out candidates below.
svcms = SVC(kernel='linear', 
            decision_function_shape='ovr', 
            gamma='auto', 
            probability=True, 
            max_iter=1000)
# Linear SVC with C=0.01 and at most 10000 iterations.
matt_svcms = svm.SVC(kernel='linear',
                     decision_function_shape='ovr', 
                     C=0.01,
                     max_iter=10000)
models1 = []
#1 - Sklearn: OneVsRestClassifier using svcms
#models1 = np.append(models1,OneVsRestClassifier(svcms))
#2 - Sklearn: OneVsOneClassifier using svcms
#models1 = np.append(models1,OneVsOneClassifier(svcms))
#3 - Sklearn: OneVsRestClassifier using matt_svcms (the only active model)
models1 = np.append(models1,OneVsRestClassifier(matt_svcms))
#4 - Sklearn: OneVsOneClassifier using matt_svcms
#models1 = np.append(models1,OneVsOneClassifier(matt_svcms))

In [25]:
#Get Predictions
# 10-fold cross validation of subtask 1 on the min / max / slope features;
# s1 has one row per model and one column per fold.
s1 = fold10_predict(models1,mnds_p1,ndsy1,ds_y1_L,1)
pd.DataFrame(s1)







Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.75848,0.751267,0.75307,0.744522,0.736351,0.747362,0.745913,0.741246,0.727256,0.755588


In [26]:
# Mean score over the 10 folds, one value per model.
avg_s1 = np.average(s1,axis=1)
pd.DataFrame(avg_s1)

Unnamed: 0,0
0,0.746106


# Subtask2 - training models

In [27]:
#Model list used for predictions
models2 = []
# Rebinds matt_svcms (also used in the subtask 1 cell) to a sigmoid-kernel SVC;
# it is only referenced by the commented-out candidate below.
matt_svcms = svm.SVC(kernel='sigmoid',
                     decision_function_shape='ovr', 
                     C=0.01,
                     max_iter=10000)
#0 - Sklearn: SVC 
#models2 = np.append(models2,SVC(kernel='sigmoid',
#                                gamma='auto',
#                                probability=True,
#                                max_iter=1000))
#1 - Sklearn: sigmoid-kernel SVC defined above
#models2 = np.append(models2,matt_svcms)
#from sklearn.svm import LinearSVC
#2 - Sklearn: LinearSVC (the only active model)
models2 = np.append(models2,sklearn.svm.LinearSVC(random_state=0,dual=False, tol=1e-10,max_iter=10000))

In [28]:
#Get predictions
# 10-fold cross validation of subtask 2 on the min / max / slope features.
s2 = fold10_predict(models2,mnds_p2,ndsy2,ds_y2_L,2)

In [29]:
# Display the per-fold scores of subtask 2 (rows: models, columns: folds).
pd.DataFrame(s2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.658932,0.713951,0.724796,0.688253,0.735791,0.711369,0.730331,0.742531,0.691106,0.683634


In [30]:
# Mean score over the 10 folds, one value per model.
avg_s2 = np.average(s2,axis=1)
pd.DataFrame(avg_s2)

Unnamed: 0,0
0,0.708069


# Subtask3 - training models
Only ridge regression (RidgeCV) is included, because it is relatively fast and scores about 0.745.

In [31]:
##Model set used for training
models3 = []
#0 - Sklearn: RidgeCV, alpha selected among 10, 20, ..., 90 (the only active model)
models3 = np.append(models3, linear_model.RidgeCV(alphas=[10*a for a in range(1,10)],
                                                 fit_intercept=True,
                                                 gcv_mode='svd'
                                                 )) 
#1 - Sklearn: Multi Task Lasso Cross Validation on alphas
#models3 = np.append(models3,linear_model.MultiTaskLassoCV(cv=10,
#                                                          alphas=[10**a for a in range(-1,10)],
#                                                          fit_intercept=True,
#                                                          max_iter=1000))
#2 Sklearn: Multitask Elastic Net with Cross Validation
#models3 = np.append(models3,linear_model.MultiTaskElasticNetCV(cv=10,random_state=0,max_iter=10000))
#models3 = np.append(models3,OneVsRest(linear_model.SGDRegressor()))

In [32]:
#Get predictions
# 10-fold cross validation of subtask 3 on the cleaned (mean per pid) features.
s3 = fold10_predict(models3, nds_p3, ndsy3, ds_y3_L, 3)

In [33]:
# Display the per-fold scores of subtask 3 (rows: models, columns: folds).
pd.DataFrame(s3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.748734,0.746099,0.74074,0.74454,0.732383,0.767828,0.743956,0.732851,0.749092,0.755045


In [34]:
# Mean score over the 10 folds, one value per model.
avg_s3 = np.average(s3,axis=1)
pd.DataFrame(avg_s3)

Unnamed: 0,0
0,0.746127


In [35]:
# Unweighted mean of the three subtask averages.
np.average([avg_s1, avg_s2, avg_s3])

0.7334338851901423

# Testset prediction - in progress

In [38]:
#Dummy code to combine three results
# The first (and currently only) model of every list is taken as the best one.
best_1 = models1[0]#Dummy
best_2 = models2[0]#Dummy
best_3 = models3[0]#Dummy

#training best model on entire dataset
best_1.fit(mnds_p1,ndsy1)
best_2.fit(mnds_p2,ndsy2)
best_3.fit(nds_p3,ndsy3)

#extract dataset to predict
testset_x = pd.read_csv("test_features.csv")
testset_x_L = list(testset_x)
test_x = testset_x.to_numpy()

#Standard Scaler of dataset 
# NOTE(review): a new scaler is fitted on the test features here (rebinding
# `scaler`) instead of reusing the one fitted on the training features, so
# train and test are standardized with different statistics - confirm intent.
scaler = StandardScaler()
scaler.fit(testset_x)
scaled_test = scaler.transform(testset_x)
# The column names come from the training header list dataset_x_L.
scaled_test = pd.DataFrame(scaled_test,columns=dataset_x_L)

test_raw = np.genfromtxt("./test_features.csv",delimiter=",",skip_header=1)

#cleaning data
# Subtasks 1 and 2 share the min / max / slope features; scale() fits its own
# scaler on the test data as well.
test12_x = scale(flatten_min_max_slope(test_raw))
#test2_x = clean_set(test_x,testset_x_L,1,1,True)
test3_x = clean_set(scaled_test,testset_x_L,1,1,True)

#reduced dataset for prediction, without pid
#ctes1 = test1_x.loc[:,"Time":"pH"] 
#ctes2 = test2_x.loc[:,"Time":"pH"]
ctes3 = test3_x.loc[:,"Time":"pH"]



In [39]:
#prediction using best models for each subtask
# Subtasks 1 and 2: sigmoid of the decision values; subtask 3: plain predictions.
pred1 = predict_sigmoid(best_1,test12_x)
pred2 = predict_sigmoid(best_2,test12_x)
pred3 = best_3.predict(ctes3)

In [44]:
#Conversion to df
# Each prediction array gets the label headers of its subtask as column names.
pred1 = pd.DataFrame(pred1,columns=ds_y1_L)
pred2 = pd.DataFrame(pred2,columns=ds_y2_L)
pred3 = pd.DataFrame(pred3,columns=ds_y3_L)

In [45]:
#Adding the pids and assemble final prediction
# Method 1 (mean per pid) on the unscaled test features; only column 0 of the
# result is used below as the pid of each row.
pids = time_reduction(testset_x,list(testset_x),12,1)
pd.DataFrame(pids) # result is discarded (not the last expression of the cell)
pids = pd.DataFrame(pids[:,0],columns=['pid'])
# Columns: pid (cast to int), subtask 1 labels, sepsis label, subtask 3 labels.
pred = pd.concat([pids.astype(int),pred1,pred2, pred3], axis=1)
pred

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.884105,0.268751,0.999823,0.999745,0.999934,0.275473,0.264228,0.208356,0.266602,0.250971,0.336695,14.090522,82.107640,98.610515,83.488713
1,10001,0.285673,0.268605,0.271105,0.268496,0.273318,0.275753,0.268795,0.269316,0.269582,0.253968,0.283785,18.245793,88.779223,94.906971,102.966801
2,10003,0.286959,0.268527,0.267113,0.270399,0.267692,0.264925,0.268274,0.476533,0.267850,0.250788,0.272699,18.694166,83.906469,97.814410,91.725402
3,10004,0.138299,0.268554,0.261414,0.259220,0.257889,0.269630,0.268992,0.244974,0.268761,0.259458,0.278446,16.554891,72.649804,95.798655,88.254371
4,10005,0.210558,0.268519,0.265864,0.264186,0.266400,0.268360,0.268489,0.261849,0.268877,0.227244,0.285449,19.317833,75.179280,95.986274,61.304061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,9989,0.452198,0.268557,0.270673,0.264436,0.266971,0.281353,0.267546,0.269880,0.268961,0.230196,0.329972,19.685199,80.626953,95.701925,103.366045
12660,9991,0.532191,0.268451,0.266830,0.259680,0.266712,0.261873,0.265663,0.105029,0.268277,0.238321,0.316115,18.281019,93.856425,98.708335,75.464507
12661,9992,0.541797,0.268484,0.267771,0.265269,0.262578,0.243247,0.267215,0.417678,0.266818,0.239790,0.282550,18.663326,70.735754,97.412140,83.687539
12662,9994,0.994535,0.268481,0.739315,0.684714,0.720919,0.990964,0.263836,0.946843,0.266389,0.221694,0.424816,16.031283,85.844212,98.422374,97.363627


In [42]:
#combine predictions into one dataframe
#output, 3 digit floats
# Written as a zip-compressed CSV without the index column.
pred.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')

In [43]:
#check format
# Only the shapes of the written file and of the sample file are compared;
# column names and values are not checked.
df_submission = pd.read_csv('prediction.zip')
df_sample = pd.read_csv('sample.zip')
df_submission.shape == df_sample.shape

True

# Not used functions, I might apply them later

In [None]:
def missing_values(raw):
    """
    Replace NaNs by the mean of their column, then standardize the dataset
    Parameters:
    Input raw - 2-D numpy float array (its NaNs are overwritten in place)
    Output - array returned by StandardScaler(copy=False).transform
    """
    n,m = raw.shape
    means = np.nanmean(raw,axis=0)
    # Impute every column first ...
    nan_rows, nan_cols = np.nonzero(np.isnan(raw))
    raw[nan_rows, nan_cols] = means[nan_cols]
    # ... and standardize once afterwards. Scaling inside the column loop
    # wrote raw-scale means into data that had already been standardized.
    scaler = StandardScaler(copy=False)
    scaler.fit(raw)
    raw = scaler.transform(raw)
    return raw

In [None]:
def flatten(raw_data):
    """
    Concatenate the 12 rows of every patient into a single row
    Parameters:
    Input raw_data - 2-D numpy array with 12 consecutive rows per patient
    Output res - numpy array of shape (n // 12, 1 + 12 * (w - 2)):
                 column 0 holds the third column of the patient's first row,
                 followed, for each of the 12 rows, by the second column and
                 the columns from the fourth one onwards
    """
    n,w = raw_data.shape
    # Integer division: a float shape raises TypeError in Python 3.
    n_patients = n // 12
    res = np.zeros((n_patients, 1 + 12 * (w - 2)))
    # Columns copied from every row: index 1, then 3 .. w-1.
    kept_cols = np.r_[1, 3:w]
    for i in range(n_patients):
        block = raw_data[i * 12:(i + 1) * 12]
        res[i, 0] = block[0, 2]
        # Row-major flattening keeps the values of one row next to each other.
        res[i, 1:] = block[:, kept_cols].ravel()
    return res

In [None]:
def setup():
    """
    Load the training csv files and build the samples and labels
    Parameters:
    Output (samples, labels) - standardized min / max / slope features and
                               columns 1 to 11 of train_labels.csv
    """
    raw_data = np.genfromtxt("./train_features.csv", delimiter=",", skip_header=1)
    # Alternatives tried earlier:
    # raw_data = missing_values(raw_data)
    # samples = flatten_pca(raw_data,1)
    flattened = flatten_min_max_slope(raw_data)
    samples = scale(flattened)
    raw_labels = np.genfromtxt("./train_labels.csv", delimiter=",", skip_header=1)
    # Column 0 is skipped; the next eleven columns are kept.
    labels = raw_labels[:, 1:12]
    return samples, labels

In [None]:
def model():
    """
    Build the classifier used by get_roc
    Parameters:
    Output - OneVsRestClassifier wrapping a linear SVC (C=0.01, max_iter=10000)
    """
    base_estimator = svm.SVC(kernel='linear',
                             decision_function_shape='ovr',
                             C=0.01,
                             max_iter=10000)
    return OneVsRestClassifier(base_estimator)

In [None]:
def predict_csv(clf):
    """
    Load train_features.csv, impute and flatten it, and return predict(data, clf)
    Parameters:
    Input clf - model forwarded to predict
    Output - whatever predict(data, clf) returns
    """
    # NOTE(review): despite the function name, the file read here is the
    # training feature file - confirm which file was intended.
    raw_data = np.genfromtxt("./train_features.csv",delimiter=",",skip_header=1)
    raw_data = missing_values(raw_data)
    data = flatten(raw_data)
    # NOTE(review): `predict` is not defined in the visible part of the file.
    return predict(data,clf)
    # Unreachable: the function has already returned above. `restructure_predict`
    # is not defined in the visible part of the file either.
    return restructure_predict(clf.predict_proba(data))

In [None]:
def get_roc(samples,labels):
    """
    Cross-validate the classifier built by model() with the ROC AUC scorer
    Parameters:
    Input samples - feature matrix
    Input labels - label matrix
    Output - array with the 'roc_auc' score of each of the 5 folds
    """
    classifier = model()
    fold_scores = cross_val_score(classifier, samples, labels, scoring='roc_auc', cv=5)
    return fold_scores


In [None]:
def main():
    """
    Run preprocessing and cross validation, printing the duration of both steps
    """
    t_begin = time.time()
    samples,labels = setup()
    _n_rows, _n_cols = samples.shape  # unused; the unpacking only succeeds for 2-D samples
    print("--------------------------------\ndone setup")
    t_setup_done = time.time()
    setup_seconds = t_setup_done - t_begin
    print ("Preprocessing takes: %.1f seconds\n--------------------------------"  % setup_seconds)

    # Cross validation of the classifier; the per-fold scores are printed as is.
    fold_scores = get_roc(samples,labels)
    print(fold_scores)
    print("--------------------------------\ndone training")
    t_training_done = time.time()
    training_seconds = t_training_done - t_setup_done
    print("Training takes: %.1f seconds\n--------------------------------"  % training_seconds)