In [None]:
import pandas as pd
import numpy as np
import math
import statistics
import datetime as dt
from matplotlib import pyplot as plt

from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.preprocessing import OneHotEncoder
from sksurv.metrics import concordance_index_censored

# Lift the column limit so displayed DataFrames show every column.
pd.options.display.max_columns = None

In [None]:
# Load the working table from 'imputed.csv' (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='imputed.csv')

In [None]:
# NOTE(review): the divisors below (86400 = seconds per day, 3600 = seconds
# per hour) imply the raw icu_stay_duration is in seconds -- confirm upstream.
# Whole-day stay length, rounded to the nearest day.
df['icu_stay_days'] = df['icu_stay_duration'].div(86400).round().astype('int64')
# Rescale the duration column itself; after this line it holds hours.
df['icu_stay_duration'] = df['icu_stay_duration'].div(3600)
# Discard the leftover "Unnamed: 0" / "0" columns when they are present.
df = df.drop(columns=["Unnamed: 0", "0"], errors='ignore')

In [None]:
# Column groups: used below to audit the schema and later to pick model inputs.
categorical = [
    'ethnicity',
    'marital_status',
    'language',
    'admission_location',
    'gender',
    'insurance',
    'first_careunit',
    'last_careunit',
    'admission_type',
]
proceduretype = ['aortic', 'mit', 'tricuspid', 'pulmonary', 'cabg']
ptParams = ['weight', 'height']
boolFields = [
    'reintubation', 'liver_severe', 'liver_mild', 'rheum', 'cvd', 'aids', 'ckd',
    'copd', 'arrhythmia', 'pud', 'smoking', 'pvd', 'paraplegia', 'ccf', 'met_ca',
    't2dm', 't1dm', 'malig', 'mi', 'dementia', 'hospital_expire_flag', 'diab_un',
    'diab_cc',
]
deathInfo = ['dod', 'deathtime']
ptinfo = ['hadm_id', 'subject_id']
durations = ['duration1', 'icu_stay_duration', 'icu_stay_days']

# Aggregate columns are recognised by a '_max' / '_min' / '_mean' marker in the name.
tsColumns = [i for i in df.columns if '_max' in i or '_min' in i or '_mean' in i]

# Schema audit: print every column that belongs to none of the groups above.
# The union is built once instead of being re-concatenated for each column.
known_columns = set(
    categorical + proceduretype + tsColumns + ptParams
    + boolFields + ptinfo + deathInfo + durations
)
print([i for i in df.columns if i not in known_columns])

for col in categorical:
    df[col] = df[col].astype('category')

# Drop the '_max' / '_min' aggregates; '_mean' columns are kept.
df = df[[i for i in df.columns if '_max' not in i and '_min' not in i]]
# for some reason last_careunit messes up the Cox training
# drop() matches the exact column name. The earlier `i not in ('last_careunit')`
# compared against a plain string (parentheses without a comma are not a
# tuple), i.e. a substring test that could silently discard other columns.
df = df.drop(columns=['last_careunit'])

print(df.shape)
df

In [None]:
# target variable: icu stay duration
# Structured array with one record per row of df. 'Status' is True for every
# record (each stay is treated as an observed event, none censored) and
# 'Stay_in_hrs' carries the icu_stay_duration value.
# Values are copied positionally via to_numpy(), so the result does not depend
# on df having a default 0..n-1 index (the former df[col][i] lookup was
# label-based and evaluated once per row in Python).
data_y = np.empty(len(df), dtype=[('Status', '?'), ('Stay_in_hrs', '<f8')])
data_y['Status'] = True
data_y['Stay_in_hrs'] = df['icu_stay_duration'].to_numpy()
data_y

In [None]:
%matplotlib inline

time, survival_prob = kaplan_meier_estimator(data_y["Status"], data_y["Stay_in_hrs"])
plt.step(time, survival_prob, where="post")
plt.ylabel("est. probability of stay $\hat{S}(t)$")
plt.xlabel("time $t$")

In [None]:
# Build the model input: leave out the duration, patient-identifier and
# death-date columns, then one-hot encode what remains.
data_x_numeric = OneHotEncoder().fit_transform(
    df.loc[:, ~df.columns.isin(durations + ptinfo + deathInfo)]
)
data_x_numeric

In [None]:
# Fit a Cox proportional hazards model on the encoded features.
# fit() hands back the fitted estimator, so the chained call binds it directly;
# the bare name on the last line keeps the estimator repr as the cell output.
estimator = CoxPHSurvivalAnalysis().fit(data_x_numeric, data_y)
estimator

In [None]:
# Concordance index of the fitted estimator, evaluated on the same data it was
# trained on. The first element of the returned tuple is displayed.
prediction = estimator.predict(data_x_numeric)
result = concordance_index_censored(
    event_indicator=data_y["Status"],
    event_time=data_y["Stay_in_hrs"],
    estimate=prediction,
)
result[0]

In [None]:
# which variables are predictive?
def fit_and_score_features(X, y):
    """Score every feature on its own with a single-variable Cox model.

    For each column of ``X`` a ``CoxPHSurvivalAnalysis`` model is fitted on
    that column alone and then evaluated on the same data with the model's
    ``score`` method.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. NumPy arrays are used as-is; other array-likes such
        as a pandas DataFrame are converted with ``np.asarray`` first.
    y : object
        Target, passed unchanged to ``fit`` and ``score``.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        ``scores[j]`` is the score of the model fitted on column ``j``.
    """
    # Accept DataFrames and nested lists too; a no-op for ndarray input.
    X = np.asarray(X)
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis()
    for j in range(n_features):
        # j:j+1 keeps the slice two-dimensional: shape (n_samples, 1).
        Xj = X[:, j:j+1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores

# Score each encoded feature individually against the survival target.
scores = fit_and_score_features(data_x_numeric.to_numpy(), data_y)

In [None]:
# Print every per-feature score, highest first. option_context lifts the row
# limit only inside the block and restores the previous setting afterwards,
# even if printing raises. The set_option/reset_option pair reset the option
# to the library default instead and was skipped on error.
with pd.option_context('display.max_rows', None):
    print(pd.Series(scores, index=data_x_numeric.columns).sort_values(ascending=False))