In [None]:
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import os
import numpy as np

In [None]:
# Directory holding the hackathon CSV files, relative to this notebook.
data_location = os.path.normpath('../data')

# One training file per task. CHOSEN_TASK indexes both this list and
# `test_data_files` below, so the two lists must stay in the same task order.
possible_data_files = ["hackathon_low_mixed_venous_oximetry.csv", "hackathon_low_cardiac_output.csv"]

data_paths = [os.path.join(data_location, data_file) for data_file in possible_data_files]

CHOSEN_TASK = 1
data_path = data_paths[CHOSEN_TASK]


test_data_files = ["hackathon_low_mixed_venous_oximetry_test_set.csv", "hackathon_low_cardiac_output_test_set.csv"]

# The path gets its own name: previously `df_test_new` briefly held a path
# string before being rebound to the DataFrame read from it.
test_data_path = os.path.join(data_location, test_data_files[CHOSEN_TASK])

# Held-out test set, with the string label / gender columns turned into
# booleans (True for "Positive" and for "F" respectively).
df_test_new = (
    pd.read_csv(test_data_path)
    .assign(ClassificationLabel = lambda df: df.ClassificationLabel=="Positive")
    .assign(gender=lambda df:df.gender=="F")
)

In [None]:
# Raw training table for the task selected through `data_path`.
df_raw = pd.read_csv(data_path)

## Custom columns and names per file

In [None]:
def bsa(weight, height, formula="mosteller"):
    """Estimate body surface area (BSA) from weight and height.

    The original body had a second, unreachable ``return`` holding the
    Du Bois formula; it is now reachable through ``formula`` while the
    default keeps the value the function has always returned.

    Parameters
    ----------
    weight : scalar or array-like
        Body weight (presumably kilograms -- confirm against the data).
    height : scalar or array-like
        Body height. It is multiplied by 100 before use, so presumably
        metres converted to centimetres -- confirm against the data.
    formula : {"mosteller", "dubois"}, default "mosteller"
        ``"mosteller"``: ``sqrt(weight * height * 100) / 60``.
        ``"dubois"``: ``0.007184 * weight**0.425 * (height * 100)**0.725``.

    Returns
    -------
    Same shape as the inputs (scalar in, scalar out; Series in, Series out).

    Raises
    ------
    ValueError
        If ``formula`` is not one of the supported names.
    """
    if formula == "mosteller":
        return np.sqrt(weight*height*100)/60
    if formula == "dubois":
        return pow(weight,0.425)*pow(height*100,0.725)*0.007184
    raise ValueError(f"Unknown BSA formula: {formula!r}")

def fun(s):
    """Return `bsa` evaluated on the ``weight`` and ``height`` attributes of ``s``."""
    weight, height = s.weight, s.height
    return bsa(weight, height)

In [None]:
# Model inputs: every column except the identifiers, the classification label
# and any regression target (columns whose name contains "Regression").
feature_columns = [
    col
    for col in df_raw.columns
    if col not in ("event_count", "ClassificationLabel", "subject_id")
    and "Regression" not in col
]

if 'cardiac_output' in data_path:
    regression_label = "RegressionLabel-CardiacIndex"
    # Remove rows whose recorded BSA disagrees with the value recomputed from
    # weight and height. The absolute difference is required: the previous
    # signed comparison kept every row whose recomputed BSA was *smaller* than
    # the recorded one, no matter by how much. `fun` only reads `.weight` and
    # `.height`, so it is applied to the whole frame at once instead of row by
    # row. Rows where either value is NaN are still dropped (NaN < 0.1 is False).
    bsa_error = (fun(df_raw) - df_raw.bsa).abs()
    df_raw = df_raw.loc[bsa_error < 0.1]
else:
    regression_label = "RegressionLabel-SvO2"

# Investigations

In [None]:
print("Number of rows")
# Row count of the (possibly BSA-filtered) raw table, shown as the cell output.
df_raw.shape[0]

In [None]:
print("Number of rows per patient")
# Count the non-null `event_count` entries per subject, then plot how many
# subjects have each count.
(
    df_raw
    .groupby("subject_id")
    .count()
    .event_count
    .value_counts()
    .sort_index()
    .plot.bar()
)
plt.show()

In [None]:
print("Number of nans per row")
# Distribution of the number of missing values found in a single row.
(
    df_raw
    .isna()
    .sum(axis=1)
    .value_counts()
    .sort_index()
    .plot.bar()
)
plt.show()

print("Number of nans per column")
# Missing-value count of every column.
(
    df_raw
    .isna()
    .sum(axis=0)
    .plot.bar()
)
plt.show()

# Preprocessing

In [None]:
# Work on an independent copy so the cleaning steps below leave `df_raw` untouched.
df = df_raw.copy(deep=True)

In [None]:
# Blank out implausible measurements (set them to NaN) rather than dropping
# rows; the imputation step decides later what happens to the gaps.
# The cut-off values are hard-coded here.
df.loc[
    df["Pulmonary_Artery_Mean_Pressure"].gt(100) | df["Pulmonary_Artery_Mean_Pressure"].eq(0),
    "Pulmonary_Artery_Mean_Pressure",
] = np.nan
df.loc[df["Central_Venous_Pressure"].gt(100), "Central_Venous_Pressure"] = np.nan
# End_Diastolic_Volume is not present in every input file.
if "End_Diastolic_Volume" in df.columns:
    df.loc[df["End_Diastolic_Volume"].gt(400), "End_Diastolic_Volume"] = np.nan
df.loc[df["Heart_Rate"].lt(40), "Heart_Rate"] = np.nan

In [None]:
# Encode label and gender as booleans. The dtype guard keeps the cell
# idempotent: re-running it on already-converted columns would otherwise
# compare booleans with a string and silently turn every value into False.
df = df.assign(
    ClassificationLabel=lambda d: d.ClassificationLabel
    if pd.api.types.is_bool_dtype(d.ClassificationLabel)
    else d.ClassificationLabel == "Positive",
    gender=lambda d: d.gender
    if pd.api.types.is_bool_dtype(d.gender)
    else d.gender == "F",
)

# Feature columns that can be averaged: numeric and not boolean.
# Built with a comprehension; the previous loop called `remove` on the list it
# was iterating over, which skips the element following every removal (and
# could also remove the "subject_id" grouping key itself).
numeric_feature_columns = [
    col
    for col in feature_columns
    if pd.api.types.is_numeric_dtype(df[col]) and not pd.api.types.is_bool_dtype(df[col])
]
mean_columns = numeric_feature_columns + ["subject_id"]

# Per-patient mean of every numeric feature, indexed by subject_id.
means = df.groupby("subject_id")[numeric_feature_columns].mean()

# Strategy for missing feature values: "overall_mean", "patient_mean", "drop"
# or "keep" (leave NaNs in place).
FILL_IN = "overall_mean"
# FILL_IN = "drop"
# FILL_IN = "patient_mean"
# FILL_IN = "keep"

if FILL_IN == "patient_mean":
    # Fill each gap with the same patient's mean of that column.
    patient_means = df.groupby("subject_id")[numeric_feature_columns].transform("mean")
    df[numeric_feature_columns] = df[numeric_feature_columns].fillna(patient_means)

    # A value that is still missing means the patient has no observation of
    # that column at all; such rows are dropped. Each row is counted once
    # (the previous list could hold the same row several times).
    unfillable = df[numeric_feature_columns].isna().any(axis=1)
    df = df.loc[~unfillable]
    print(f"Dropped {int(unfillable.sum())} rows because we were unabble to fill them")

elif FILL_IN == "overall_mean":
    # Assign the filled frame back instead of `df[col].fillna(..., inplace=True)`:
    # that form works on a temporary Series and does not update `df` under
    # pandas Copy-on-Write.
    # NOTE(review): the means are computed before the train/test split, so
    # held-out rows influence the imputed values.
    df[numeric_feature_columns] = df[numeric_feature_columns].fillna(
        df[numeric_feature_columns].mean()
    )
elif FILL_IN == "drop":
    previous_len = len(df)
    df = df.dropna()
    print(f"Dropped {previous_len - len(df)}")


means = means.add_suffix('___mean')

# Train test split

In [None]:
import random
random.seed(0)

# Split by patient so that no subject contributes rows to both sides.
# The ids are sorted before shuffling: iterating a `set` gives no stable order
# across kernel restarts when the ids are strings (string hashing is
# randomised per process), so seeding alone did not make the split reproducible.
subjects = sorted(df.subject_id.unique())
random.shuffle(subjects)

# First 4/5 of the shuffled subjects train the model, the rest validate it.
N = int(len(subjects)/5*4)
train_subjects = subjects[:N]
test_subjects = subjects[N:]


df_train=df.loc[lambda df: df.subject_id.isin(train_subjects)]
df_test=df.loc[lambda df: df.subject_id.isin(test_subjects)]

print(len(df_test))
print(len(df_train))

# Model training High risk

In [None]:
from sklearn.model_selection import train_test_split

# Cut-offs applied to the regression target to obtain binary labels.
# Training uses a higher cut-off (2.6) than validation / test (2.4), so more
# rows count as positive while fitting.
threshold_training = 2.6
threshold_validation_test = 2.4
thresh = threshold_validation_test

# Training rows (80 % of the subjects).
X = df_train[feature_columns].values
y = df_train[regression_label].lt(threshold_training)

# Internal validation rows (remaining 20 % of the subjects).
X_val = df_test[feature_columns].values
y_val = df_test[regression_label].lt(threshold_validation_test)


# Separate test-set file loaded at the top of the notebook.
X_test = df_test_new[feature_columns].values
y_test = df_test_new[regression_label].lt(threshold_validation_test)


In [None]:
from sklearn import tree
import xgboost

# Shallow gradient-boosted classifier fitted on the training split.
# Alternative baseline kept for reference: tree.DecisionTreeClassifier(max_depth=2)
clf = xgboost.XGBClassifier(
    max_depth=2,
    min_child_weight=40,
    scale_pos_weight=1,
    subsample=1,
    colsample_bytree=1,
)
clf = clf.fit(X, y)

In [None]:
from yellowbrick.classifier import (
    ClassBalance,
    ROCAUC,
    ClassificationReport,
    ClassPredictionError,
)

# The attribute is set by hand before the estimator is given to yellowbrick.
# NOTE(review): presumably a workaround so the visualizer accepts the xgboost
# model -- confirm it is still needed with the installed versions.
clf.target_type_ = int
rocauc = ROCAUC(clf, classes=[0, 1], size=(700, 500))

# ROC curve of the fitted model on the separate test set.
rocauc.score(X_test, y_test)
r = rocauc.poof()

## Evaluation

In [None]:
# NOTE(review): stray scratch calculation (evaluates to about 0.714); nothing
# visible in this notebook reads the result -- consider deleting this cell.
25/35

In [None]:
import sklearn.metrics as metrics

# A test row is predicted positive when the probability in the first
# `predict_proba` column (class 0) falls below THRESHOLD.
THRESHOLD = 0.044
y_test_predict = clf.predict_proba(X_test)[:,0] < THRESHOLD

# Both classes are pinned explicitly. Deriving the row / column names from
# `max(y_test)+1` breaks with a shape mismatch whenever `y_test` holds a single
# class while the predictions hold two.
conf_matrix = metrics.confusion_matrix(y_test, y_test_predict, labels=[False, True])
conf_df = pd.DataFrame(conf_matrix,
            index=["label_0", "label_1"],
            columns=["pred_0", "pred_1"])
# NOTE(review): the negative row is scaled by 10 -- presumably because the
# negatives were down-sampled 1:10 in the test file; confirm.
conf_df.loc["label_0"] = 10*conf_df.loc["label_0"]
conf_df

In [None]:
# Apply the high-risk rule to every row of the cleaned data set (this includes
# the rows the model was trained on).
predict = clf.predict_proba(df[feature_columns].values)[:,0] < THRESHOLD

high_risk_patients = df.loc[predict]
print(f"Number of high risk in whole data {len(high_risk_patients)}")
# A mistake is a flagged row whose regression target lies above the validation
# cut-off `thresh`. The previous spelling `thres` is not defined anywhere and
# raised a NameError.
# NOTE(review): the count is scaled by 10, mirroring the confusion-matrix cell.
print(f"Number of mistakes {(high_risk_patients[regression_label]>thresh).sum()*10}")

In [None]:
# Mean feature profile of the rows flagged as high risk. The previous version
# read `suspicious_pacients`, which is only created in a later cell (so the
# cell failed on a fresh kernel) and holds the low-risk selection instead.
high_risk_patients_summary = high_risk_patients[feature_columns].mean()

# Model training low risk

In [None]:
from sklearn.model_selection import train_test_split

# Cut-offs applied to the regression target to obtain binary labels.
# Training uses a higher cut-off (2.7) than validation / test (2.4), so more
# rows count as positive while fitting.
threshold_training = 2.7
threshold_validation_test = 2.4
thresh = threshold_validation_test

# Training rows (80 % of the subjects).
X = df_train[feature_columns].values
y = df_train[regression_label].lt(threshold_training)

# Internal validation rows (remaining 20 % of the subjects).
X_val = df_test[feature_columns].values
y_val = df_test[regression_label].lt(threshold_validation_test)


# Separate test-set file loaded at the top of the notebook.
X_test = df_test_new[feature_columns].values
y_test = df_test_new[regression_label].lt(threshold_validation_test)


In [None]:
from sklearn import tree
import xgboost

# Shallow gradient-boosted classifier fitted on the training split; same
# hyper-parameters as the high-risk model, only the training labels differ.
# Alternative baseline kept for reference: tree.DecisionTreeClassifier(max_depth=2)
clf = xgboost.XGBClassifier(
    max_depth=2,
    min_child_weight=40,
    scale_pos_weight=1,
    subsample=1,
    colsample_bytree=1,
)
clf = clf.fit(X, y)

In [None]:
from yellowbrick.classifier import (
    ClassBalance,
    ROCAUC,
    ClassificationReport,
    ClassPredictionError,
)

# The attribute is set by hand before the estimator is given to yellowbrick.
# NOTE(review): presumably a workaround so the visualizer accepts the xgboost
# model -- confirm it is still needed with the installed versions.
clf.target_type_ = int
rocauc = ROCAUC(clf, classes=[0, 1], size=(700, 500))

# ROC curve of the fitted model on the separate test set.
rocauc.score(X_test, y_test)
r = rocauc.poof()

In [None]:
import sklearn.metrics as metrics

# A test row is predicted positive when the probability in the first
# `predict_proba` column (class 0) falls below THRESHOLD; rows above it are
# the ones treated as low risk.
THRESHOLD = 0.7
y_test_predict = clf.predict_proba(X_test)[:,0] < THRESHOLD

# Both classes are pinned explicitly. Deriving the row / column names from
# `max(y_test)+1` breaks with a shape mismatch whenever `y_test` holds a single
# class while the predictions hold two.
conf_matrix = metrics.confusion_matrix(y_test, y_test_predict, labels=[False, True])
conf_df = pd.DataFrame(conf_matrix,
            index=["label_0", "label_1"],
            columns=["pred_0", "pred_1"])
# NOTE(review): the negative row is scaled by 10 -- presumably because the
# negatives were down-sampled 1:10 in the test file; confirm.
conf_df.loc["label_0"] = 10*conf_df.loc["label_0"]
conf_df

In [None]:
# Rows whose first-column (class 0) probability exceeds THRESHOLD, evaluated
# on every row of the cleaned data set.
predict = clf.predict_proba(df[feature_columns].values)[:, 0] > THRESHOLD
suspicious_pacients = df[predict]

# The identified count is scaled by 10; a mistake is a selected row whose
# regression target lies below the validation cut-off.
print(f"number of identified {len(suspicious_pacients)*10}")
print(f"Number of mistakes {(suspicious_pacients[regression_label]<thresh).sum()}")

In [None]:
# Mean feature profile of the rows selected as low risk.
comparison = suspicious_pacients[feature_columns].mean().to_frame(name="suspicuos_negative")

# Profiles of the remaining rows, split by the true label at the validation
# cut-off `thresh` (the previous spelling `thres` is undefined and raised a
# NameError). The boolean mask is indexed like `df`; `.loc` aligns it to the
# rows of `df_test`.
# NOTE(review): the selected group is taken from all of `df`, while the "rest"
# groups come from `df_test` only -- confirm this asymmetry is intended.
comparison["rest_negative"] = df_test[feature_columns].loc[~predict & (df[regression_label]>thresh)].mean()
comparison["rest_positive"] = df_test[feature_columns].loc[~predict & (df[regression_label]<thresh)].mean()
# `series_bad` is not defined anywhere in this notebook; the per-feature means
# of the high-risk selection are stored as `high_risk_patients_summary`.
# NOTE(review): confirm this is the series that was meant.
comparison["suspicious_positive"] = high_risk_patients_summary

comparison