In [118]:
import pandas as pd 
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix, classification_report, recall_score, roc_auc_score, precision_score 
import dalex as dx

In [119]:
data = pd.read_excel('data/42256_2020_253_MOESM1_ESM.xlsx')

In [120]:
data["Days from admission to death"].unique()

array([   nan,  3.5  , 23.5  ,  4.5  , 12.5  , 13.5  , 16.5  , 17.5  ,
       18.5  ,  2.5  ,  5.5  ,  6.5  ,  7.5  ,  8.5  , 11.5  , 14.5  ,
       19.5  , 10.5  ,  1.5  , 33.5  ,  9.5  , 26.5  ,  9.875, 20.5  ])

In [121]:
data.iloc[170]

Age                                                     66
Gender                                                Male
Date of presentation emergency room    2020-04-02 00:00:00
Date of admission                      2020-04-02 00:00:00
Date of discharge                      2020-04-04 00:00:00
Admission to ICU                                        No
Survival/death                                       Alive
Date of death                                          NaT
Days from admission to death                           NaN
Date blood analysis                    2020-04-02 00:00:00
LD                                                     176
CRP                                                     64
Lymphocytes                                           9.15
Leukocytes                                            14.9
Percentage lymphocytes                           61.409396
Name: 170, dtype: object

In [122]:
data.head()

Unnamed: 0,Age,Gender,Date of presentation emergency room,Date of admission,Date of discharge,Admission to ICU,Survival/death,Date of death,Days from admission to death,Date blood analysis,LD,CRP,Lymphocytes,Leukocytes,Percentage lymphocytes
0,73,Male,2020-03-19,2020-03-19 00:00:00,2020-05-26 00:00:00,Yes,Alive,NaT,,2020-03-19,485,154,0.62,6.6,9.393939
1,60,Female,2020-03-18,2020-03-18 00:00:00,2020-03-22 00:00:00,No,Alive,NaT,,2020-03-20,316,77,1.61,7.7,20.909091
2,44,Female,2020-03-20,2020-03-20 00:00:00,2020-03-24 00:00:00,No,Alive,NaT,,2020-03-20,444,82,0.81,6.8,11.911765
3,58,Male,2020-03-20,2020-03-20 00:00:00,2020-03-23 00:00:00,Yes,Alive,NaT,,2020-03-20,323,58,0.73,5.8,12.586207
4,66,Male,2020-03-20,2020-03-20 00:00:00,2020-03-25 00:00:00,No,Alive,NaT,,2020-03-20,244,20,1.39,5.1,27.254902


In [123]:
df = data[["Age", "Gender", "LD", "CRP", "Percentage lymphocytes", "Survival/death"]]

In [124]:
df = df.assign(death = np.where(df["Survival/death"] == 'Alive', 0, 1))

In [125]:
df = df[["Age", "Gender", "LD", "CRP", "Percentage lymphocytes", "death"]]

In [126]:
X = df.drop("death", axis = 1)
y = df.death

In [127]:
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)

In [128]:
model = xgb.XGBClassifier(
            max_depth=4,
            learning_rate=0.2,
            reg_lambda=1,
            n_estimators=150,
            subsample=0.9,
            colsample_bytree=0.9,
            eval_metric = 'aucpr')

In [129]:
numerical_features = ['Age', 'LD', 'CRP', 'Percentage lymphocytes']
numerical_transformer = Pipeline(
    steps=[
        #('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

categorical_features = ['Gender']
categorical_transformer = Pipeline(
    steps=[
        #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)



In [130]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

In [131]:
clf.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'LD', 'CRP',
                                                   'Percentage lymphocytes']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Gender'])])),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,...
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interact

In [132]:
y_pred = clf.predict(x_test)

In [133]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88        50
           1       0.46      0.55      0.50        11

    accuracy                           0.80        61
   macro avg       0.68      0.70      0.69        61
weighted avg       0.82      0.80      0.81        61



In [134]:
precision_score(y_test, y_pred)

0.46153846153846156

In [135]:
exp = dx.Explainer(clf, x_test, y_test)

Preparation of a new explainer is initiated

  -> data              : 61 rows 5 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 61 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7f581209b160> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.000108, mean = 0.215, max = 0.975
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.971, mean = -0.0346, max = 0.999
  -> model_info        : package sklearn

A new explainer has been created!


In [136]:
exp.model_parts().plot()

In [137]:
pdp_num = exp.model_profile(type = 'partial', label="pdp")

Calculating ceteris paribus: 100%|██████████| 5/5 [00:00<00:00, 50.26it/s]


In [138]:
pdp_num.plot()

# Powinniśmy patrzeć na krew czy wygląd ludzi? 

In [139]:
numerical_features = ['Age']
numerical_transformer = Pipeline(
    steps=[('scaler', StandardScaler())])

categorical_features = ['Gender']
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

clf_1 = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

numerical_features = ['LD', 'CRP', 'Percentage lymphocytes']
numerical_transformer = Pipeline(
    steps=[
        #('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),]
)

clf_2 = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])


In [140]:
age_sex = clf_1.fit(X[['Age','Gender']], y)
blood = clf_2.fit(X[['LD', 'CRP', 'Percentage lymphocytes']], y)

In [141]:
age_sex_exp = dx.Explainer(age_sex,X[['Age','Gender']], y)
blood_exp = dx.Explainer(blood, X[['LD', 'CRP', 'Percentage lymphocytes']], y)

Preparation of a new explainer is initiated

  -> data              : 305 rows 2 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 305 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7f581209b160> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.00177, mean = 0.148, max = 0.745
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.745, mean = 0.0521, max = 0.995
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 305 rows 3 cols
  -> target variable   : Para

In [142]:
arena=dx.Arena()

age_sex_exp = dx.Explainer(age_sex,X[['Age','Gender']], y)
blood_exp = dx.Explainer(blood, X[['LD', 'CRP', 'Percentage lymphocytes']], y)
age_sex_exp.label = "age_sex"
blood_exp.label = "blood"

arena.push_model(age_sex_exp)
arena.push_model(blood_exp)
arena.push_observations(df)
arena.run_server(port=9294)

Preparation of a new explainer is initiated

  -> data              : 305 rows 2 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 305 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7f581209b160> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.00177, mean = 0.148, max = 0.745
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.745, mean = 0.0521, max = 0.995
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 305 rows 3 cols
  -> target variable   : Para

Exception in thread Thread-39:
Traceback (most recent call last):
  File "/home/hania/anaconda3/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/hania/anaconda3/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/hania/anaconda3/lib/python3.8/site-packages/dalex/arena/server.py", line 97, in start_server
    app.run(host=host, port=port)
  File "/home/hania/.local/lib/python3.8/site-packages/flask/app.py", line 990, in run
    run_simple(host, port, self, **options)
  File "/home/hania/.local/lib/python3.8/site-packages/werkzeug/serving.py", line 1052, in run_simple
    inner()
  File "/home/hania/.local/lib/python3.8/site-packages/werkzeug/serving.py", line 996, in inner
    srv = make_server(
  File "/home/hania/.local/lib/python3.8/site-packages/werkzeug/serving.py", line 847, in make_server
    return ThreadedWSGIServer(
  File "/home/hania/.local/lib/python3.8/site-packages/werkzeug/servin

In [104]:
arena.stop_server()

In [105]:
cross_val_score(age_sex, X[['Age','Gender']], y, cv=3, scoring='precision').mean()

0.40165631469979296

In [106]:
cross_val_score(blood, X[['LD', 'CRP', 'Percentage lymphocytes']], y, cv = 3, scoring='precision').mean()

0.26666666666666666

## Czyli nie trzeba pobierać krwi, wystarczy na kogoś popatrzeć! 

# A jak wygląda ta sprawa w Chinach? 

In [107]:
china = pd.read_excel('data/time_series_375_prerpocess_en.xlsx')
china

Unnamed: 0,PATIENT_ID,RE_DATE,age,gender,Admission time,Discharge time,outcome,Hypersensitive cardiac troponinI,hemoglobin,Serum chloride,...,mean corpuscular hemoglobin,Activation of partial thromboplastin time,High sensitivity C-reactive protein,HIV antibody quantification,serum sodium,thrombocytocrit,ESR,glutamic-pyruvic transaminase,eGFR,creatinine
0,1.0,2020-01-31 01:09:00.000000,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,,,,...,,,,,,,,,,
1,,2020-01-31 01:25:00.000000,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,,136.0,,...,31.9,,,,,0.12,,,,
2,,2020-01-31 01:44:00.000000,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,,,103.1,...,,,43.1,,137.7,,,16.0,46.6,130.0
3,,2020-01-31 01:44:59.999999,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,,,,...,,,,,,,,,,
4,,2020-01-31 01:56:00.000001,73,1,2020-01-30 22:12:47,2020-02-17 12:40:09,0,19.9,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6115,,2020-02-16 11:21:00.000001,68,1,2020-02-08 23:25:01,2020-02-19 01:31:58,1,84.9,,,...,,,,,,,,,,
6116,,2020-02-16 12:04:00.000000,68,1,2020-02-08 23:25:01,2020-02-19 01:31:58,1,,,,...,,,,,,,,,,
6117,,2020-02-16 12:14:00.000000,68,1,2020-02-08 23:25:01,2020-02-19 01:31:58,1,,,105.2,...,,,267.0,,139.3,,,17.0,88.6,77.0
6118,,2020-02-16 14:11:00.000000,68,1,2020-02-08 23:25:01,2020-02-19 01:31:58,1,,155.0,,...,31.6,,,,,,,,,


In [108]:
from raport_v1.utils_features_selection import data_read_and_split

In [109]:
X_data_all_features, Y_data, x_col = data_read_and_split()

In [110]:
blood_df = X_data_all_features
blood_df.head()

Unnamed: 0_level_0,红细胞计数,白细胞计数,红细胞压积,球蛋白,平均血红蛋白浓度,单核细胞(#),乳酸脱氢酶,尿素,淋巴细胞(#),γ-谷氨酰转肽酶,...,凝血酶原活动度,葡萄糖,RBC分布宽度SD,RBC分布宽度CV,平均PLT体积,血小板压积,大血小板比率,PLT分布宽度,D-D二聚体定量,降钙素原
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.05,9.67,38.0,30.1,345.0,0.76,206.0,6.5,2.5,41.0,...,115.0,6.75,41.5,11.9,11.3,0.16,36.9,14.3,0.92,0.09
2,4.63,10.37,43.6,33.2,342.0,0.59,282.0,3.5,0.95,50.0,...,117.0,11.32,44.1,13.1,9.4,0.27,19.8,10.1,0.44,0.09
3,3.78,7.68,35.7,29.8,353.0,0.47,226.0,4.22,2.1,53.0,...,94.0,9.42,42.7,12.6,9.7,0.23,21.4,10.1,0.98,0.06
4,2.63,6.78,29.8,31.9,346.0,0.58,249.0,3.6,1.12,14.0,...,68.0,5.78,59.3,14.3,9.0,0.27,16.3,8.1,1.26,0.38
5,4.34,7.95,37.9,30.8,343.0,0.59,179.0,3.0,1.88,21.0,...,83.0,4.84,39.4,12.4,10.0,0.36,24.3,11.1,0.42,0.02


In [111]:
age_sex_df = china.loc[china.PATIENT_ID.isin(X_data_all_features.index)][['age','gender','outcome']]
y = age_sex_df.outcome
age_sex_df = age_sex_df.drop('outcome', axis =1)

In [112]:
age_sex_df.head()

Unnamed: 0,age,gender
0,73,1
24,61,1
53,70,2
71,74,1
85,29,2


In [113]:
blood_df

Unnamed: 0_level_0,红细胞计数,白细胞计数,红细胞压积,球蛋白,平均血红蛋白浓度,单核细胞(#),乳酸脱氢酶,尿素,淋巴细胞(#),γ-谷氨酰转肽酶,...,凝血酶原活动度,葡萄糖,RBC分布宽度SD,RBC分布宽度CV,平均PLT体积,血小板压积,大血小板比率,PLT分布宽度,D-D二聚体定量,降钙素原
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.05,9.67,38.0,30.1,345.0,0.76,206.0,6.50,2.50,41.0,...,115.0,6.75,41.5,11.9,11.3,0.16,36.9,14.3,0.92,0.09
2,4.63,10.37,43.6,33.2,342.0,0.59,282.0,3.50,0.95,50.0,...,117.0,11.32,44.1,13.1,9.4,0.27,19.8,10.1,0.44,0.09
3,3.78,7.68,35.7,29.8,353.0,0.47,226.0,4.22,2.10,53.0,...,94.0,9.42,42.7,12.6,9.7,0.23,21.4,10.1,0.98,0.06
4,2.63,6.78,29.8,31.9,346.0,0.58,249.0,3.60,1.12,14.0,...,68.0,5.78,59.3,14.3,9.0,0.27,16.3,8.1,1.26,0.38
5,4.34,7.95,37.9,30.8,343.0,0.59,179.0,3.00,1.88,21.0,...,83.0,4.84,39.4,12.4,10.0,0.36,24.3,11.1,0.42,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,4.70,5.12,41.3,33.3,346.0,0.28,573.0,5.30,0.77,27.0,...,86.0,7.07,38.8,11.9,10.9,0.16,33.0,13.1,2.57,1.51
372,3.35,28.04,29.1,29.6,351.0,0.26,607.0,31.30,0.17,135.0,...,27.0,5.78,48.8,15.5,14.0,0.07,54.1,25.3,11.11,1.14
373,3.62,12.13,29.3,36.9,341.0,0.55,702.0,5.20,0.76,39.0,...,77.0,8.53,38.7,13.9,12.9,0.18,46.1,19.7,-1.00,0.56
374,3.94,14.49,34.7,30.1,349.0,0.42,1867.0,18.40,0.33,176.0,...,26.0,10.28,-1.0,-1.0,10.8,0.13,30.1,12.4,21.00,-1.00


In [114]:
age_sex_df_model = model.fit(age_sex_df, y)
blood_model = model.fit(blood_df, y)

In [115]:
cross_val_score(age_sex_df_model, age_sex_df, y, cv = 3, scoring='precision').mean()

0.7271326079052155

In [116]:
cross_val_score(blood_model, blood_df, y, cv = 3, scoring='precision').mean()

0.948798328108673