In [21]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
from scipy.stats import zscore
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the three CSV inputs: training features, training labels and test features.
df_train, df_train_labels, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv', 'train_labels.csv', 'test_features.csv')
)

# Join time-data for patients

In [5]:
def feature_creation(pid,df):
    """
    Build a single row of summary statistics for one patient.

    Parameters
    ----------
    pid : scalar
        Patient id; the rows with ``df['pid'] == pid`` are aggregated.
    df : pandas.DataFrame
        Long table with one row per measurement time. It must contain the
        columns 'pid' and 'Time'. The first three columns are excluded from
        every statistic except the mean (presumably pid, Time and Age -
        confirm against the CSV header).

    Returns
    -------
    pandas.DataFrame
        One row holding, side by side: the mean of every column except
        'Time', then max, min, kurtosis, mean absolute deviation, skewness
        and standard deviation of columns 3 onwards, then one 0/1 flag per
        mean column that is 1 when that mean is NaN. Columns are relabelled
        0..k-1 (``ignore_index=True``), so the original names are lost.
    """
    patient = df.loc[df['pid'] == pid]
    # measurement columns only: skip the first three columns of the table
    measurements = patient.iloc[:, 3:]

    # mean of every column (this keeps the pid itself as column 0), without 'Time'
    mean = pd.DataFrame(patient.mean()).transpose()
    mean = mean.drop('Time', axis=1)

    maxi = pd.DataFrame(measurements.max()).transpose()       # maximum
    mini = pd.DataFrame(measurements.min()).transpose()       # minimum
    kurt = pd.DataFrame(measurements.kurtosis()).transpose()  # kurtosis
    # mean absolute deviation around the mean; computed by hand because
    # DataFrame.mad() was deprecated in pandas 1.5 and removed in pandas 2.0
    abs_dev = (measurements - measurements.mean()).abs()
    mad = pd.DataFrame(abs_dev.mean()).transpose()
    skew = pd.DataFrame(measurements.skew()).transpose()      # skewness
    std = pd.DataFrame(measurements.std()).transpose()        # standard deviation
    # 1 where the mean is NaN (i.e. the patient has no value for that column), else 0
    nans = mean.isna().astype(int)

    # put all feature groups next to each other in one row
    total = pd.concat([mean,maxi,mini,kurt,mad,skew,std,nans], axis = 1, ignore_index = True)
    return total



In [6]:
# Unique patient ids of the training set, in order of first appearance
# (Series.unique() keeps that order).
pids = df_train['pid'].unique()

# One feature row per patient. Grouping once hands feature_creation only the
# rows of the current patient instead of re-scanning the full table for every
# pid; sort=False keeps the groups in the same order as `pids`.
train_rows = [
    feature_creation(pid, patient_rows)
    for pid, patient_rows in tqdm(df_train.groupby('pid', sort=False), total=pids.size)
]

# Stack all rows with a single concat; concatenating inside the loop copies
# the growing frame on every iteration (quadratic cost).
df_all = pd.concat(train_rows, axis=0, ignore_index=True)


100%|████████████████████████████████████| 18994/18994 [01:23<00:00, 228.28it/s]


In [8]:
# Same feature extraction as for the training set, now applied to the test set.
# Unique patient ids in order of first appearance; reused later for the hand-in file.
pids_2 = df_test['pid'].unique()

# One feature row per test patient; sort=False keeps the groups in the same
# order as `pids_2`.
test_rows = [
    feature_creation(pid, patient_rows)
    for pid, patient_rows in tqdm(df_test.groupby('pid', sort=False), total=pids_2.size)
]

# Single concat instead of growing the frame inside the loop.
df_all_test = pd.concat(test_rows, axis=0, ignore_index=True)

100%|████████████████████████████████████| 12663/12663 [00:51<00:00, 244.86it/s]


# Data Pre-Processing
In order to preprocess the data we want to take the following steps:

1. **fix missing values**
    - replace missing values by the mean of the data in this column
    - note that the missing values could also be replaced by other measures such as the median
  
  
2. **standardize data**
    - fix heavy tailed distribution
    - distribute around 0 with standard deviation 1
    - we can also additionally normalize the data, which is useful in some cases (also in ours?)


# 1. Missing values

In [None]:
#prints the percentage of missing values for each column of the data frame
#print(df_all.isna().sum()/len(df_all)*100)

In [11]:
#this loops over the columns names
"""
#Find the missing percentage of each column in the training set.
def find_missing_percent(data):
    
    #Returns dataframe containing the total missing values and percentage of total
    #missing values of a column.
    
    miss_df = pd.DataFrame({'ColumnName':[],'TotalMissingVals':[],'PercentMissing':[]})
    for col in data.columns:
        sum_miss_val = data[col].isna().sum()
        percent_miss_val = round((sum_miss_val/data.shape[0])*100,2)
        miss_df = pd.concat([miss_df,pd.DataFrame({'ColumnName': [col],'TotalMissingVals': [sum_miss_val],'PercentMissing': [percent_miss_val]})])
    return miss_df

miss_df = find_missing_percent(df_train)
"""
    
#Find the missing percentage of each column of a data frame.
#Columns are addressed by position, so duplicated column names are handled too.
def find_missing_percent(data):
    """
    Summarise the missing values of every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; column labels may be duplicated.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (in positional order) with the columns
        'ColumnIndex' (0-based column position), 'TotalMissingVals' (number of
        NaN entries) and 'PercentMissing' (share of NaN entries in percent,
        rounded to two decimals).
    """
    n_rows, n_cols = data.shape
    # isna().sum() keeps the positional column order even when labels repeat
    missing_counts = data.isna().sum().to_numpy()
    percent_missing = np.round(missing_counts / n_rows * 100, 2)
    # built in one step: concatenating one row per column onto an empty frame
    # is quadratic and leaves every row with the index label 0
    return pd.DataFrame({
        'ColumnIndex': np.arange(n_cols),
        'TotalMissingVals': missing_counts,
        'PercentMissing': percent_missing,
    })

# Missing-value statistics of the per-patient training features (one row per column position).
miss_df = find_missing_percent(df_all)


In [12]:

# Remove every feature column whose share of missing values in the training
# features exceeds 70%; the columns at the same positions are removed from the
# test features as well.
"""
make sure that this actually drops the right columns as we have different columns named the same!
"""
too_sparse = miss_df['PercentMissing'] > 70
drop_cols = np.asarray(miss_df[too_sparse].ColumnIndex).astype(int)
df_all = df_all.drop(columns=df_all.columns[drop_cols])
df_all_test = df_all_test.drop(columns=df_all_test.columns[drop_cols])


In [13]:
# NOTE: other imputation strategies could be tried here as well, for example
# k-nearest-neighbour imputation or the column median instead of the mean.

# Column means are estimated on the training features only and reused for the
# test features, so both sets are imputed with the same values and no
# statistic is derived from the test set.
train_means = df_all.mean()
df_all = df_all.fillna(train_means)
df_all_test = df_all_test.fillna(train_means)

# 2. Standardization
for the moment I just standardize all columns to be distributed around 0 with standard deviation 1 (not caring about the distribution of the data)

It could be necessary to fix the distribution
 - fixing skewness using a log transformation
 - removing outliers by observing the z-score

In [27]:
# z-score scaling: zero mean and unit variance per column
std_scaler = StandardScaler()
# estimate mean and standard deviation on the training features and scale them
df_all = pd.DataFrame(std_scaler.fit_transform(df_all), columns=df_all.columns)
# apply the *training* statistics to the test features; re-fitting the scaler
# on the test set would put train and test on different scales
df_all_test = pd.DataFrame(std_scaler.transform(df_all_test), columns=df_all_test.columns)

**now we are ready to start the classification tasks!**
- **df_all is the training set,**
- **df_train_labels are the corresponding labels**
- **and df_all_test is the test set**


# SUB-TASK 1

**Task:** anticipate the further needs of the patient 
- binary classification (0: no further tests, 1: further tests)
- labels we want to predict:
    - LABEL_BaseExcess, LABEL_Fibrinogen, LABEL_AST, LABEL_Alkalinephos, LABEL_Bilirubin_total, LABEL_Lactate, LABEL_TroponinI, LABEL_SaO2, LABEL_Bilirubin_direct, LABEL_EtCO2.

In [32]:
# Bring features and labels into the array format expected by scikit-learn.
# labels of sub-task 1 (one binary column per medical test)
labels_task1 = df_train_labels[['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct', 'LABEL_EtCO2']]
# feature matrix without column 0 (the per-patient mean of the pid column,
# which is an identifier and not a feature)
X = df_all.to_numpy()[:,1:]
Y = labels_task1.to_numpy()
# test set, same column layout as X
X_test = df_all_test.to_numpy()[:,1:]
# every training patient needs exactly one label row
assert X.shape[0] == Y.shape[0], "feature rows and label rows do not match"
# hold out 10% for validation; fixed seed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.9, random_state=420)

In [35]:
"""
#SVC for label 1 in the list of labels
#create probabilistic predictions 
clf = RandomForestClassifier()
clf.fit(x_train,y_train[:,0])
y_pred = clf.predict(x_test)
print('accuracy for i = 0: ',accuracy_score(y_test[:,0],y_pred))
prob = clf.predict_proba(x_test)[:,1].reshape(-1,1)#probability to get a 1

for i in tqdm(range(1,10)):   
    clf = RandomForestClassifier()
    clf.fit(x_train,y_train[:,i])
    y_pred = clf.predict(x_test)
    print(f'accuracy for i = {i}: ',accuracy_score(y_test[:,i],y_pred))
    prob_i = clf.predict_proba(x_test)[:,1].reshape(-1,1)#probability to get a 1
    prob = np.concatenate((prob,prob_i),axis=1)
"""

accuracy for i = 0:  0.8794736842105263


 11%|█████                                        | 1/9 [00:07<00:57,  7.22s/it]

accuracy for i = 1:  0.93


 22%|██████████                                   | 2/9 [00:14<00:49,  7.09s/it]

accuracy for i = 2:  0.7752631578947369


 33%|███████████████                              | 3/9 [00:21<00:42,  7.07s/it]

accuracy for i = 3:  0.7694736842105263


 44%|████████████████████                         | 4/9 [00:28<00:35,  7.07s/it]

accuracy for i = 4:  0.7731578947368422


 56%|█████████████████████████                    | 5/9 [00:34<00:27,  6.87s/it]

accuracy for i = 5:  0.8326315789473684


 67%|██████████████████████████████               | 6/9 [00:40<00:19,  6.43s/it]

accuracy for i = 6:  0.9157894736842105


 78%|███████████████████████████████████          | 7/9 [00:46<00:12,  6.45s/it]

accuracy for i = 7:  0.8215789473684211


 89%|████████████████████████████████████████     | 8/9 [00:54<00:06,  6.71s/it]

accuracy for i = 8:  0.9689473684210527


100%|█████████████████████████████████████████████| 9/9 [00:59<00:00,  6.56s/it]

accuracy for i = 9:  0.9536842105263158





In [36]:
# After validating the approach, train the final models on the whole training
# set: one random forest per label of sub-task 1.
label_probs = []
for i in tqdm(range(Y.shape[1])):
    # fixed seed so repeated runs give the same predictions
    clf_1 = RandomForestClassifier(random_state=420)
    clf_1.fit(X,Y[:,i])
    # column 1 of predict_proba is the probability of the positive class
    # (assumes both classes occur in the training labels)
    label_probs.append(clf_1.predict_proba(X_test)[:,1])

# shape (n_test_patients, n_labels), columns in the order of the labels in Y
pred = np.column_stack(label_probs)


100%|█████████████████████████████████████████████| 9/9 [01:07<00:00,  7.47s/it]


In [38]:
#put this into a dataframe
# Wrap the task 1 probabilities in a data frame, one named column per label.
task1_columns = [
    'LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
    'LABEL_Bilirubin_direct', 'LABEL_EtCO2',
]
df_task1 = pd.DataFrame(data=pred, columns=task1_columns)

# SUB-TASK 2

In [41]:
# label of sub-task 2
labels_task2 = df_train_labels[['LABEL_Sepsis']]
# a single-column frame already converts to an array of shape (n, 1)
Y2 = labels_task2.to_numpy()
# hold out 10% for validation; fixed seed so the split is reproducible
x2_train, x2_test, y2_train, y2_test = train_test_split(X, Y2, train_size=0.9, random_state=420)

In [42]:
"""
#create probabilistic predictions 
#clf2 = svm.SVC(probability = True)#kernel='rbf'
clf2 = RandomForestClassifier()
clf2.fit(x2_train,y2_train.reshape(-1))
y2_pred = clf2.predict(x2_test)
print('accuracy : ',accuracy_score(y2_test,y2_pred))
prob2 = clf2.predict_proba(x2_test)[:,1].reshape(-1,1)#probability to get a 1
"""

accuracy :  0.9457894736842105


In [43]:
# After validating the approach, train the final sepsis model on the whole training set.
#clf2 = svm.SVC(probability = True)#kernel='rbf'
# fixed seed so repeated runs give the same predictions
clf_2 = RandomForestClassifier(random_state=420)
clf_2.fit(X,Y2.ravel())
# column 1 of predict_proba is the probability of the positive class
prediction_2 = clf_2.predict_proba(X_test)[:,1].reshape(-1,1)


In [44]:
# Wrap the sepsis probabilities in a single-column data frame.
task2_columns = ['LABEL_Sepsis']
df_task2 = pd.DataFrame(data=prediction_2, columns=task2_columns)

(12664, 10)
(12664, 1)


# SUB-TASK 3

In [46]:
# This cell must be live code: the following cells use Y3, x3_train, x3_test,
# y3_train and y3_test, which are defined nowhere else.
# labels of sub-task 3
labels_task3 = df_train_labels[['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']]
# regression targets as an array, one column per label
Y3 = labels_task3.to_numpy()

# hold out 10% for validation; fixed seed so the split is reproducible
x3_train, x3_test, y3_train, y3_test = train_test_split(X, Y3, train_size=0.9, random_state=420)

In [49]:
# Validation for sub-task 3: one random forest regressor per target, trained on
# the 90% split and scored on the held-out 10%.
#test_scaled = StandardScaler().fit_transform(x3_test)
val_predictions = []
for i in tqdm(range(y3_train.shape[1])):
    reg = RandomForestRegressor(n_estimators=100, random_state=420)
    reg.fit(x3_train, y3_train[:,i])
    # score() of a regressor is the coefficient of determination R^2
    print(reg.score(x3_test, y3_test[:,i]))
    # predictions on the held-out part
    val_predictions.append(reg.predict(x3_test))

# shape (n_validation_patients, n_targets), columns in the order of y3_train
predictions_3 = np.column_stack(val_predictions)

0.3902456224039975


 33%|███████████████                              | 1/3 [01:12<02:24, 72.42s/it]

0.6064882875616302


 67%|██████████████████████████████               | 2/3 [02:31<01:16, 76.56s/it]

0.3661576010486117


100%|█████████████████████████████████████████████| 3/3 [03:45<00:00, 75.12s/it]

0.6523939058597674





In [50]:
# Final models for sub-task 3: one random forest regressor per target, trained
# on the whole training set and applied to the test set.
#test_scaled_4 = StandardScaler().fit_transform(X_test)
test_predictions = []
for i in tqdm(range(Y3.shape[1])):
    reg4 = RandomForestRegressor(n_estimators=100, random_state=420)
    reg4.fit(X, Y3[:,i])
    # predict with the model fitted just above (reg4); the first target was
    # previously predicted with `reg`, the validation model of another target
    test_predictions.append(reg4.predict(X_test))

# shape (n_test_patients, n_targets), columns in the order of Y3
predictions_4 = np.column_stack(test_predictions)

100%|█████████████████████████████████████████████| 3/3 [04:19<00:00, 86.35s/it]


In [51]:
# Wrap the task 3 predictions in a data frame, one named column per target.
task3_columns = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
df_task3 = pd.DataFrame(data=predictions_4, columns=task3_columns)

# Data export

In [60]:
# Assemble the hand-in table: test patient ids followed by the predictions of
# the three sub-tasks, then write it to disk with three decimals.
pid_labels = pd.DataFrame({'pid': pids_2.astype(int)})
handin_parts = [pid_labels, df_task1, df_task2, df_task3]
df_handin = pd.concat(handin_parts, axis=1)
df_handin.to_csv('prediction_sorted.csv', index=False, float_format='%.3f')