In [11]:
import pandas as pd
import datetime
import re
import warnings
import numpy as np
warnings.filterwarnings('ignore')

In [13]:
df = pd.read_excel("../data/france_dataset.xlsx")
cols = df.columns

In [14]:
df['date_hospital_admi'] = pd.to_datetime(df['date_hospital_admi'])
df['date_ICU_admi'] = pd.to_datetime(df['date_ICU_admi'])
df['date_hospital_end'] = pd.to_datetime(df['date_hospital_end'], errors='coerce')
df['date_death'] = pd.to_datetime(df['date_death'], errors='coerce')

In [15]:
df.dropna(subset=['ICU_DC'])

Unnamed: 0,id,HR_min,HR_max,SBP_min,SBP_max,DBP_min,DBP_max,SAPSII,RR,GCS,...,ldh_j1,ldh_j2,ldh_j3,ldh_j4,ldh_j5,L_pourc_J1,L_pourc_J2,L_pourc_J3,L_pourc_J4,L_pourc_J5
0,49,61,94,93,190,66,132,27,28,15,...,.,465.00,457.00,412.00,362.0,25.0000,.,.,.,.
1,52,85,105,90,107,56,73,31,28,15,...,652.0,900.00,.,.,.,.,13.3333,.,.,.
2,55,81,170,101,154,48,66,54,20,15,...,498.0,505.00,.,.,596.0,.,.,.,.,.
3,56,79,124,105,132,40,65,27,.,15,...,568.0,.,.,.,.,4.1667,.,.,.,.
4,58,77,90,110,166,54,80,47,.,15,...,455.0,450.00,535.00,.,497.0,10.0000,.,.,4.1176,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,415,87,104,106,137,67,85,33,31,15,...,503.0,.,.,.,.,4.7619,.,6.1224,.,5.2632
174,420,64,89,109,176,69,97,37,32,13,...,466.0,552.00,599.00,.,.,1.9417,.,.,6.1856,6.2500
175,421,52,77,92,159,51,87,48,25,15,...,550.0,.,.,.,.,15.7895,13.2075,.,.,.
176,424,84,97,129,152,77,85,33,26,15,...,579.0,.,.,.,.,6.5041,.,8.6667,.,2.9630


## Budowa modelu

In [16]:
data = df[['date_death','ICU_DC','male', 'Age', 'crp_J1', 'crp_J2', 'crp_J3', 'crp_J4', 'crp_J5',
       'ldh_j1', 'ldh_j2', 'ldh_j3', 'ldh_j4', 'ldh_j5', 'L_pourc_J1',
       'L_pourc_J2', 'L_pourc_J3', 'L_pourc_J4', 'L_pourc_J5']]
data = data.replace('.', np.nan)

#### Preprocessing danych

In [17]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np


data = data.drop('date_death', axis = 1)
cols = data.columns
scaler = MinMaxScaler()
data = scaler.fit_transform(data)
data = pd.DataFrame(data, columns = cols)

data['CRP'] = data[['crp_J1', 'crp_J2',  'crp_J3', 'crp_J4', 'crp_J5']].mean(axis = 1)
data['LDH'] = data[['ldh_j1', 'ldh_j2', 'ldh_j3', 'ldh_j4', 'ldh_j5']].mean(axis = 1)
data['L%'] = data[['L_pourc_J1', 'L_pourc_J2', 'L_pourc_J3', 'L_pourc_J4', 'L_pourc_J5']].mean(axis = 1)

#### Tworzenie modeli

In [18]:
y = data['ICU_DC']
X = data[['male', 'Age', 'CRP', 'LDH', 'L%']]

from sklearn.model_selection import train_test_split, cross_val_score

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)


In [20]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix, classification_report, recall_score, roc_auc_score, precision_score 
import dalex as dx

### Chiński model

In [21]:
model = xgb.XGBClassifier(
            max_depth=4,
            learning_rate=0.2,
            reg_lambda=1,
            n_estimators=150,
            subsample=0.9,
            colsample_bytree=0.9,
            eval_metric = 'aucpr')

categorical_features = ['male']
categorical_transformer = Pipeline(
    steps=[
        #('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ]
)
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])
clf.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['male'])])),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=0.9, eval_metric='aucpr',
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interaction_constraints='', learning_rate=0.2,
                               max_delta_step=0, max_depth=4,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=150,
                               n_jobs=4, num_parallel_tree=1, 

In [22]:
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.81      1.00      0.89        29
         1.0       0.00      0.00      0.00         7

    accuracy                           0.81        36
   macro avg       0.40      0.50      0.45        36
weighted avg       0.65      0.81      0.72        36



In [23]:
precision_score(y_test, y_pred) #rozumiem że chiński model, ale żeby aż tak?

0.0

### Wiek i płeć czy krew?

In [24]:
clf_1 = Pipeline(steps=[('classifier', model)])
clf_2 = Pipeline(steps=[('classifier', model)])

age_sex = clf_1.fit(X[['Age','male']], y)
blood = clf_2.fit(X[['LDH', 'CRP', 'L%']], y)

In [25]:
cross_val_score(age_sex, X[['male','Age']], y, cv=3, scoring='precision').mean()

0.27424330055909

In [26]:
cross_val_score(blood, X[['LDH', 'CRP', 'L%']], y, cv = 3, scoring='precision').mean()

0.3834961334961335