In [23]:
# Imports
import pandas as pd
import numpy as np
import random
from random import shuffle
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.metrics import roc_auc_score, make_scorer
import sklearn.metrics as metrics
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_columns', None)


from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

from sklearn.ensemble import RandomForestRegressor

In [2]:
# helper functions

def get_unique_pids(featuresdf):
    """Return the distinct values of the 'pid' column, in order of first appearance."""
    pid_column = featuresdf['pid']
    return pid_column.unique()

# aggregate data for all features
def aggregate_all(featuresdf):
    """Pivot the per-row records into a single wide row per pid.

    Rows are numbered 1, 2, ... within each pid and every column is unstacked
    into '<column>_<row number>' columns. For 'Age' and the per-vital summary
    columns ('<vital>_mean/std/min/max/pct_1..pct_11') only the '_1' copy is
    kept; the '_2' .. '_12' copies are dropped.

    Raises KeyError if any of the columns to drop is missing, e.g. when a
    patient-level column was never created or no pid has 12 rows.
    """
    row_number = featuresdf.groupby(['pid']).cumcount() + 1
    wide = featuresdf.set_index(['pid', row_number]).unstack()
    # Flatten the (column, row number) MultiIndex into 'column_rownumber'.
    wide.columns = ['_'.join(str(part) for part in column) for column in wide.columns]
    wide = wide.reset_index()

    vitals = ['Heartrate', 'SpO2', 'ABPs', 'ABPm', 'ABPd', 'RRate', 'Temp']
    summary_stats = ['mean', 'std', 'min', 'max'] + ['pct_' + str(i) for i in range(1, 12)]
    once_per_patient = ['Age'] + [vital + '_' + stat for vital in vitals for stat in summary_stats]

    # Copies 2..12 of these columns are dropped; only the '_1' copy is kept.
    redundant = [name + '_' + str(step) for name in once_per_patient for step in range(2, 13)]
    return wide.drop(columns=redundant)

def featureselect(featuresdf, featurenames):
    """Select the 12 numbered copies of the requested features, grouped by pid.

    For every name in ``featurenames`` the columns '<name>_1' .. '<name>_12'
    are selected. Rows are returned grouped by pid, with pids ordered by first
    appearance and the original row order kept inside each pid; the original
    index labels are preserved. Rows whose pid is missing are left out.

    The previous implementation grew the result with ``DataFrame.append`` once
    per pid, which is quadratic and no longer exists in pandas 2.0. The rows
    are now reordered in a single step, and columns keep their native dtypes.

    Raises KeyError if a requested numbered column is not in ``featuresdf``.
    """
    featurenamesnr = [name + '_' + str(i) for name in featurenames for i in range(1, 13)]

    # factorize numbers pids by first appearance (-1 marks a missing pid), so a
    # stable sort on those codes reproduces the "one pid after another" order.
    codes, _ = pd.factorize(featuresdf['pid'])
    order = np.argsort(codes, kind='stable')
    order = order[codes[order] >= 0]

    return featuresdf[featurenamesnr].iloc[order]

def calc_distribution(featuresdf):
    """Average the per-patient mean and standard deviation of every column.

    For each pid the column-wise mean and (sample) standard deviation are
    computed over that patient's rows; those per-patient values are then
    averaged across patients, skipping NaNs.

    Returns a ``(means, stds)`` pair of Series indexed by the columns of
    ``featuresdf`` (the 'pid' column included, as before).

    The previous implementation appended one row per patient with
    ``DataFrame.append``, which is quadratic and no longer exists in
    pandas 2.0; a groupby computes the same statistics in one pass.
    """
    # Group by the pid *values* rather than the column label so that 'pid'
    # itself stays in the result like every other column.
    by_patient = featuresdf.groupby(featuresdf['pid'].to_numpy(), sort=False)
    means = by_patient.mean().mean()
    stds = by_patient.std().mean()
    return means, stds

def impute(featuresdf, impute_strategy):
    """Fill missing values, first within each patient and then globally.

    Step 1: for every pid, each column is linearly interpolated over that
    patient's rows, with leading/trailing gaps filled from the nearest value
    (``limit_direction='both'``). Columns a patient has no value for stay NaN.

    Step 2: the remaining NaNs are filled according to ``impute_strategy``:

    * ``'random_mean'`` - draws from a normal distribution parametrised by
      ``calc_distribution`` of the un-interpolated input.
    * ``'mean'`` - the column mean after step 1.
    * ``'real'`` - fixed reference values, matched to columns by position.
    * anything else - no global fill; the interpolated frame is returned.

    Returns a new DataFrame; ``featuresdf`` itself is not modified.

    Changes from the previous version:
    * ``'random_mean'`` raised NameError because ``means``/``stds`` were never
      computed; they are now computed when that strategy is requested.
    * The input is no longer interpolated in place, so re-running a cell that
      calls this function gives the same result.
    * The per-pid boolean-mask loop is replaced by one grouped transform.
    * Random draws are indexed like the frame rather than assuming 0..n-1.
    """
    # Reference fill values for 'real', matched to the columns BY POSITION
    # (extra columns beyond the 37 listed get no fill value).
    # NOTE(review): presumably this follows the column order of the features
    # CSV, with -1 for the three leading non-measurement columns - confirm.
    medicalvalues = [-1, -1, -1, 40, 29, 16, 1.3, 37, 15, 24, 0, 15, 300, 3.5, 7, 0.75, 40, 26, 0.6, 275, 97, 100, 85, 1.7, 4.4, 70, 9.45, 80, 97.5, 0.1, 101, 45, 80, 0.5, 0.02, 105, 7.4]

    if impute_strategy == 'random_mean':
        # Must be computed before interpolation changes the data.
        means, stds = calc_distribution(featuresdf)

    imputed = featuresdf.copy()
    value_columns = [column for column in featuresdf.columns if column != 'pid']

    # interpolate for patients that have at least one value for that test
    if value_columns and len(imputed):
        imputed[value_columns] = (
            featuresdf.groupby('pid', sort=False)[value_columns]
            .transform(lambda block: block.interpolate(limit_direction='both'))
        )

    # fill for patients without any value for that test
    if impute_strategy == 'random_mean':
        for column in imputed.columns:
            draws = pd.Series(
                np.random.normal(means[column], stds[column], size=len(imputed)),
                index=imputed.index,
            )
            imputed[column] = imputed[column].fillna(draws)
    elif impute_strategy == 'mean':
        imputed = imputed.fillna(imputed.mean())
    elif impute_strategy == 'real':
        imputed = imputed.fillna(dict(zip(imputed.columns.tolist(), medicalvalues)))

    return imputed

def normalize(featuresdf):
    """Standardise every column except the first to zero mean and unit std.

    The first column is passed through unchanged; each remaining column has
    its mean subtracted and is divided by its sample standard deviation.

    A column with zero standard deviation used to be divided by 0, turning
    the whole column into NaN. Such a column is now divided by 1 instead, so
    it comes out as all zeros.
    """
    id_part = featuresdf.iloc[:, :1]
    values = featuresdf.iloc[:, 1:]

    spread = values.std()
    # Keep NaN stds as they are (NaN != 0 is True); only exact zeros are replaced.
    spread = spread.where(spread != 0, 1.0)

    return pd.concat([id_part, (values - values.mean()) / spread], axis=1)
     

def split(featuresdf, labelsdf, ratio, randomized, seed=None):
    """Split features and labels into train/test sets by patient.

    Parameters
    ----------
    featuresdf, labelsdf : DataFrame
        Both must have a 'pid' column. Only pids present in ``featuresdf``
        are considered.
    ratio : float
        Fraction of the unique pids assigned to the training set (the first
        ``int(ratio * n_pids)`` pids; the rest go to the test set).
    randomized : bool
        Shuffle the pids before cutting. Without it, pids are taken in order
        of first appearance in ``featuresdf``.
    seed : int, optional
        Seed for a reproducible shuffle. With the default ``None`` the global
        ``random`` state is used, exactly as before.

    Returns
    -------
    X_train, Y_train, X_test, Y_test
        All sorted by 'pid'. The X frames have their first column removed
        (presumably 'pid' - confirm against the caller); the Y frames keep
        every column.

    The previous implementation built the four frames with one
    ``DataFrame.append`` per pid, which is quadratic and no longer exists in
    pandas 2.0. Membership filtering selects the same rows in one pass.
    """
    pids = featuresdf['pid'].unique().tolist()
    if randomized:
        if seed is None:
            shuffle(pids)
        else:
            random.Random(seed).shuffle(pids)

    cut = int(ratio * len(pids))
    train_pids = pids[:cut]
    test_pids = pids[cut:]

    def _rows_for(frame, wanted_pids):
        # Stable sort keeps the original row order for rows sharing a pid.
        selected = frame.loc[frame['pid'].isin(wanted_pids)]
        return selected.sort_values('pid', kind='mergesort')

    X_train = _rows_for(featuresdf, train_pids).iloc[:, 1:]
    Y_train = _rows_for(labelsdf, train_pids)
    X_test = _rows_for(featuresdf, test_pids).iloc[:, 1:]
    Y_test = _rows_for(labelsdf, test_pids)

    return X_train, Y_train, X_test, Y_test

def medtest_features(featuresdf):
    medicaltests = ['EtCO2', 'PTT', 'BUN', 'Lactate', 'Hgb', 'HCO3', 'BaseExcess', 'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine', 'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose', 'Magnesium', 'Potassium', 'Calcium', 'Alkalinephos', 'Bilirubin_direct', 'Chloride', 'Hct', 'Bilirubin_total', 'TroponinI', 'pH']
    
    # add boolean value for each test as new feature
    for medicaltest in medicaltests:
        featuresdf[medicaltest + 'Test'] = featuresdf[medicaltest].notnull().astype('int')
    
    return featuresdf


def vital_features(featuresdf):
    """Add per-patient summary features for every vital sign.

    For each vital and each pid the following values are computed over that
    patient's rows and repeated on every one of the patient's rows:

    * '<vital>_mean', '_std' (sample std), '_min', '_max'
    * '<vital>_pct_1' .. '<vital>_pct_11': the relative change between
      consecutive rows (row i versus row i-1, 0-based), computed after
      back-filling gaps within the patient.

    Returns a new DataFrame with the 15 columns per vital appended; an
    existing column of the same name is replaced. The input is not modified.

    Raises IndexError if any patient has fewer than 12 rows (the previous
    implementation failed with the same exception type on ``.iloc[i]``).

    Changes from the previous version:
    * One masked assignment per pid x vital x statistic (each scanning the
      whole frame) is replaced by grouped, vectorised operations.
    * ``pct_change(fill_method='bfill')`` is deprecated in recent pandas; the
      back-fill and the ratio are now computed explicitly.
    * Columns are created as floats instead of int zeros that were later
      overwritten with floats; the unused ``vitalvalues`` list is removed.
    """
    vitals = ['Heartrate', 'SpO2', 'ABPs', 'ABPm', 'ABPd', 'RRate', 'Temp']
    n_changes = 11

    # Group by pid values (not by index alignment) so any row index works.
    patient_keys = featuresdf['pid'].to_numpy()
    position = featuresdf.groupby(patient_keys, sort=False).cumcount()
    if len(position) and position.groupby(patient_keys, sort=False).max().min() < n_changes:
        raise IndexError(
            'vital_features needs at least %d rows per pid' % (n_changes + 1)
        )

    derived = {}
    for vital in vitals:
        per_patient = featuresdf[vital].groupby(patient_keys, sort=False)
        for stat in ('mean', 'std', 'min', 'max'):
            derived[vital + '_' + stat] = per_patient.transform(stat)

        # Relative change to the previous row of the same patient.
        filled = per_patient.bfill()
        previous = filled.groupby(patient_keys, sort=False).shift(1)
        change = filled / previous - 1

        for i in range(1, n_changes + 1):
            # Keep only the value at row i of each patient, then broadcast it
            # to all of that patient's rows ('first' = first non-null value).
            at_step = change.where(position == i)
            derived[vital + '_pct_' + str(i)] = (
                at_step.groupby(patient_keys, sort=False).transform('first')
            )

    derived_frame = pd.concat(derived, axis=1)
    stale = [name for name in derived_frame.columns if name in featuresdf.columns]
    return pd.concat([featuresdf.drop(columns=stale), derived_frame], axis=1)

In [3]:
# Read Input
# max_patients is not referenced anywhere else in the visible notebook.
max_patients = 18995
# Number of patients to load; lower it for a faster experimental run.
num_patients = 18995
# Reads 12 feature rows for every label row.
# NOTE(review): presumably the CSV holds exactly 12 consecutive rows per patient - confirm.
features = pd.read_csv('train_features.csv', nrows=num_patients*12)
labels = pd.read_csv('train_labels.csv', nrows=num_patients)

In [4]:
#features.info()
#features.describe()
#features.isnull().sum()

In [5]:
#features_import = pd.read_csv('features_impreal.csv')

In [6]:
#features_import

In [33]:
# Feature pipeline, applied top to bottom: test indicators -> imputation with
# reference values -> per-patient vital statistics -> standardisation ->
# one wide row per patient.
features_impreal = (
    features
    .pipe(medtest_features)
    .pipe(impute, 'real')
    .pipe(vital_features)
    .pipe(normalize)
    .pipe(aggregate_all)
)

In [348]:
#features_impreal.to_csv('features_impreal.csv')

In [9]:
# ratio=1 puts every patient into the training set, so X_test / Y_test are empty.
X_train, Y_train, X_test, Y_test = split(
    features_impreal,
    labels,
    ratio=1,
    randomized=False,
)

In [363]:
########################
## TRAINING SUBTASK 1 ##
########################

In [None]:
# Names of the ten subtask-1 targets (each has a 'LABEL_<name>' label column).
predictions = [
    'BaseExcess',
    'Fibrinogen',
    'AST',
    'Alkalinephos',
    'Bilirubin_total',
    'Lactate',
    'TroponinI',
    'SaO2',
    'Bilirubin_direct',
    'EtCO2',
]

In [11]:
def _fit_subtask1_classifier(target):
    """Fit the subtask-1 random forest for the label column 'LABEL_<target>'.

    All ten classifiers share one hyperparameter set, which now lives in a
    single place. ``random_state`` is fixed so that re-running the notebook
    reproduces the same forests; it was previously left unseeded.
    """
    model = RandomForestClassifier(
        n_estimators=1000,
        min_samples_split=10,
        min_samples_leaf=4,
        max_features='sqrt',
        max_depth=100,
        bootstrap=False,
        random_state=0,
    )
    return model.fit(X_train, Y_train['LABEL_' + target])


clf_BaseExcess = _fit_subtask1_classifier('BaseExcess')
clf_Fibrinogen = _fit_subtask1_classifier('Fibrinogen')
clf_AST = _fit_subtask1_classifier('AST')
clf_Alkalinephos = _fit_subtask1_classifier('Alkalinephos')
clf_Bilirubin_total = _fit_subtask1_classifier('Bilirubin_total')
clf_Lactate = _fit_subtask1_classifier('Lactate')
clf_TroponinI = _fit_subtask1_classifier('TroponinI')
clf_SaO2 = _fit_subtask1_classifier('SaO2')
clf_Bilirubin_direct = _fit_subtask1_classifier('Bilirubin_direct')
clf_EtCO2 = _fit_subtask1_classifier('EtCO2')

In [24]:
#roc_BaseExcess = roc_auc_score(Y_test.LABEL_BaseExcess, clf_BaseExcess.predict_proba(X_test)[:,1])
#roc_Fibrinogen = roc_auc_score(Y_test.LABEL_Fibrinogen, clf_Fibrinogen.predict_proba(X_test)[:,1])
#roc_AST = roc_auc_score(Y_test.LABEL_AST, clf_AST.predict_proba(X_test)[:,1])
#roc_Alkalinephos = roc_auc_score(Y_test.LABEL_Alkalinephos, clf_Alkalinephos.predict_proba(X_test)[:,1])
#roc_Bilirubin_total = roc_auc_score(Y_test.LABEL_Bilirubin_total, clf_Bilirubin_total.predict_proba(X_test)[:,1])
#roc_Lactate = roc_auc_score(Y_test.LABEL_Lactate, clf_Lactate.predict_proba(X_test)[:,1])
#roc_TroponinI = roc_auc_score(Y_test.LABEL_TroponinI, clf_TroponinI.predict_proba(X_test)[:,1])
#roc_SaO2 = roc_auc_score(Y_test.LABEL_SaO2, clf_SaO2.predict_proba(X_test)[:,1])
#roc_Bilirubin_direct = roc_auc_score(Y_test.LABEL_Bilirubin_direct, clf_Bilirubin_direct.predict_proba(X_test)[:,1])
#roc_EtCO2 = roc_auc_score(Y_test.LABEL_EtCO2, clf_EtCO2.predict_proba(X_test)[:,1])
#print('BaseExcess: ' + str(roc_BaseExcess))
#print('Fibrinogen: ' + str(roc_Fibrinogen))
#print('AST: ' + str(roc_AST))
#print('Alkalinephos: ' + str(roc_Alkalinephos))
#print('Bilirubin_total: ' + str(roc_Bilirubin_total))
#print('Lactate: ' + str(roc_Lactate))
#print('TroponinI: ' + str(roc_TroponinI))
#print('SaO2: ' + str(roc_SaO2))
#print('Bilirubin_direct: ' + str(roc_Bilirubin_direct))
#print('EtCO2: ' + str(roc_EtCO2))
#print('Total: ' + str((roc_BaseExcess+roc_Fibrinogen+roc_AST+roc_Alkalinephos+roc_Bilirubin_total+roc_Lactate+roc_TroponinI+roc_SaO2+roc_Bilirubin_direct+roc_EtCO2)/10))

In [90]:
###################################

In [345]:
########################
## TRAINING SUBTASK 2 ##
########################

In [12]:
# Sepsis classifier (subtask 2). random_state is fixed so that re-running the
# notebook reproduces the same forest; it was previously left unseeded.
clf_sepsis = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=12,
    min_samples_leaf=7,
    max_features=65,
    max_depth=90,
    bootstrap=False,
    random_state=0,
).fit(X_train, Y_train.LABEL_Sepsis)

In [479]:
########################
## TRAINING SUBTASK 3 ##
########################

In [14]:
def _fit_subtask3_regressor(target):
    """Fit the subtask-3 random forest regressor for 'LABEL_<target>'.

    The four regressors share one hyperparameter set, which now lives in a
    single place. ``random_state`` is fixed so that re-running the notebook
    reproduces the same forests; it was previously left unseeded.
    """
    model = RandomForestRegressor(
        n_estimators=1000,
        min_samples_split=10,
        min_samples_leaf=4,
        max_features='sqrt',
        max_depth=100,
        bootstrap=False,
        random_state=0,
    )
    return model.fit(X_train, Y_train['LABEL_' + target])


# The names keep their 'clf_' prefix because later cells refer to them.
clf_RRate = _fit_subtask3_regressor('RRate')
clf_ABPm = _fit_subtask3_regressor('ABPm')
clf_SpO2 = _fit_subtask3_regressor('SpO2')
clf_Heartrate = _fit_subtask3_regressor('Heartrate')

In [95]:
#clf_RRate = linear_model.Lasso(alpha=0.1, max_iter=10000).fit(X_train,Y_train.LABEL_RRate)
#clf_ABPm = linear_model.Lasso(alpha=0.01, max_iter=10000).fit(X_train,Y_train.LABEL_ABPm)
#clf_SpO2 = linear_model.Lasso(alpha=0.001, max_iter=10000).fit(X_train,Y_train.LABEL_SpO2)
#clf_SpO2 = SVR().fit(X_train, Y_train.LABEL_SpO2)
#clf_Heartrate = linear_model.Lasso(alpha=0.1, max_iter=10000).fit(X_train,Y_train.LABEL_Heartrate)

In [96]:
#roc_RRate = 0.5 + 0.5 * np.maximum(0, metrics.r2_score(Y_test.LABEL_RRate, clf_RRate.predict(X_test)))
#roc_ABPm = 0.5 + 0.5 * np.maximum(0, metrics.r2_score(Y_test.LABEL_ABPm, clf_ABPm.predict(X_test)))
#roc_SpO2 = 0.5 + 0.5 * np.maximum(0, metrics.r2_score(Y_test.LABEL_SpO2, clf_SpO2.predict(X_test)))
#roc_Heartrate = 0.5 + 0.5 * np.maximum(0, metrics.r2_score(Y_test.LABEL_Heartrate, clf_Heartrate.predict(X_test)))
#print('RRate: ' + str(roc_RRate))
#print('ABPm: ' + str(roc_ABPm))
#print('SpO2: ' + str(roc_SpO2))
#print('Heartrate: ' + str(roc_Heartrate))
#print('Total: ' + str((roc_RRate+roc_ABPm+roc_SpO2+roc_Heartrate)/4))

RRate: 0.7161671383389238
ABPm: 0.8078344537125959
SpO2: 0.6883924506904575
Heartrate: 0.823426938800143
Total: 0.7589552453855299


In [None]:
###############################
###############################
###############################

In [None]:
################
## SUBMISSION ##
################

In [15]:
test_features = pd.read_csv('test_features.csv')
# One submission row per patient, in order of first appearance in the file.
# drop_duplicates does not depend on every patient having exactly 12
# consecutive rows (the old `iloc[::12]` did), and the explicit copy makes
# `submission` an independent frame that is safe to add columns to.
submission = test_features[['pid']].drop_duplicates().copy()

In [16]:
# Same feature pipeline as for the training data, applied top to bottom.
# Note that normalize() standardises with the statistics of the frame it is
# given, i.e. the test set's own mean/std here.
test_features_impreal = (
    test_features
    .pipe(medtest_features)
    .pipe(impute, 'real')
    .pipe(vital_features)
    .pipe(normalize)
    .pipe(aggregate_all)
)

In [17]:
# Put the aggregated rows in exactly the pid order of `submission`, then drop
# 'pid' so only model inputs remain. The predictions below are written into
# `submission` by position, so the two row orders must match. The previous
# string sort produced that order only when the CSV itself was sorted by pid
# as text; looking rows up by pid is correct for any file order and raises
# KeyError if a submission pid has no feature row.
test_features_impreal = (
    test_features_impreal
    .set_index('pid')
    .loc[submission['pid'].to_numpy()]
    .reset_index(drop=True)
)

In [18]:
#X_train, Y_train, X_test, Y_test = split(features_impmean, labels, 1, False)

In [19]:
# run predictions for test data and fill in submission
# Classifiers contribute column 1 of predict_proba (the second entry of the
# model's classes_, presumably the positive label - confirm).
probability_models = {
    'LABEL_BaseExcess': clf_BaseExcess,
    'LABEL_Fibrinogen': clf_Fibrinogen,
    'LABEL_AST': clf_AST,
    'LABEL_Alkalinephos': clf_Alkalinephos,
    'LABEL_Bilirubin_total': clf_Bilirubin_total,
    'LABEL_Lactate': clf_Lactate,
    'LABEL_TroponinI': clf_TroponinI,
    'LABEL_SaO2': clf_SaO2,
    'LABEL_Bilirubin_direct': clf_Bilirubin_direct,
    'LABEL_EtCO2': clf_EtCO2,
    'LABEL_Sepsis': clf_sepsis,
}
# Regressors contribute their raw prediction.
regression_models = {
    'LABEL_RRate': clf_RRate,
    'LABEL_ABPm': clf_ABPm,
    'LABEL_SpO2': clf_SpO2,
    'LABEL_Heartrate': clf_Heartrate,
}

# Dicts keep insertion order, so the submission columns are created in the
# order listed above.
for label_column, model in probability_models.items():
    submission[label_column] = model.predict_proba(test_features_impreal)[:, 1]
for label_column, model in regression_models.items():
    submission[label_column] = model.predict(test_features_impreal)

In [20]:
# Write the predictions as 'submission.csv' inside a zip archive, rounded to
# three decimals and without the row index.
zip_options = dict(method='zip', archive_name='submission.csv')
submission.to_csv(
    'submission.zip',
    index=False,
    float_format='%.3f',
    compression=zip_options,
)