# Praca domowa 4
Odtworzenie modeli z artykułu [Predictive modeling in urgent care: a comparative study of machine learning approaches](https://academic.oup.com/jamiaopen/article/1/1/87/5032901).

Link do repozytorium tego artykułu: [https://github.com/illidanlab/urgent-care-comparative](https://github.com/illidanlab/urgent-care-comparative).

## Funkcje pomocnicze
Poniższe funkcje zostały wzięte z repozytorium powyższego artykułu.

In [35]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc as auc_score, confusion_matrix, f1_score
from sklearn.model_selection import *
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
def balanced_subsample(x, y, subsample_size=1.0):
    """Undersample ``x`` so that every class in ``y`` contributes equally many rows.

    The number of rows kept per class is the size of the smallest class,
    optionally scaled down by ``subsample_size``.

    Args:
        x: NumPy array of samples, indexed along the first axis.
        y: NumPy array of class labels, one per row of ``x``.
        subsample_size: Fraction of the smallest class to keep per class.
            Values ``>= 1`` keep the whole smallest class.

    Returns:
        Tuple ``(xs, ys)``. ``xs`` holds the selected rows grouped by class
        (in ``np.unique(y)`` order); ``ys`` holds the matching labels as a
        float array.

    Note:
        Rows of over-represented classes are picked with ``np.random.shuffle``,
        so the result depends on NumPy's global random state.
    """
    # Boolean-mask indexing copies the rows, so shuffling below never
    # touches the caller's ``x``.
    per_class = [(label, x[y == label]) for label in np.unique(y)]
    min_elems = min((rows.shape[0] for _, rows in per_class), default=0)

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems * subsample_size)

    xs = []
    ys = []
    for label, rows in per_class:
        # Only classes that have to be cut down are shuffled.
        if len(rows) > use_elems:
            np.random.shuffle(rows)

        xs.append(rows[:use_elems])
        ys.append(np.full(use_elems, label, dtype=np.float64))

    xs = np.concatenate(xs)
    ys = np.concatenate(ys)

    return xs, ys

In [3]:
def single_score(y_te, yhat):
    """Score binary predictions at the ROC-optimal decision threshold.

    The threshold is the ROC point that maximises ``tpr - fpr``; scores at or
    above it are labelled 1, the rest 0.

    Args:
        y_te: True binary labels.
        yhat: Predicted scores, one per sample. Any array-like that flattens
            to one value per sample is accepted (e.g. shape ``(n,)`` or
            ``(n, 1)``). The argument is not modified.

    Returns:
        Tuple ``(roc_auc, f1, sen, spec)``: area under the ROC curve, F1
        score, sensitivity and specificity at the chosen threshold.
    """
    # Work on a flat float view and never write into it, so the caller's
    # predictions stay intact.
    scores = np.asarray(yhat, dtype=float).ravel()

    fpr, tpr, thresholds = roc_curve(y_te, scores)
    roc_auc = auc_score(fpr, tpr)
    optimal_threshold = thresholds[np.argmax(tpr - fpr)]

    # A single comparison instead of two in-place assignments: writing 1s
    # first and then zeroing everything below the threshold wipes out the
    # positives whenever the threshold is greater than 1.
    labels = (scores >= optimal_threshold).astype(int)

    tn, fp, fn, tp = confusion_matrix(y_te, labels).ravel()
    sen = tp / (tp + fn)
    spec = tn / (tn + fp)
    f1 = f1_score(y_te, labels)
    return roc_auc, f1, sen, spec

In [4]:
# this function was simplified a bit
def get_task():
    """Load the in-hospital mortality target from the pickle file ``y``.

    Returns:
        NumPy array built from the first element of every record stored in
        the pickled collection.
    """
    # NOTE: pickle executes arbitrary code on load; only use a trusted file.
    with open("y", "rb") as handle:
        records = pickle.load(handle)

    return np.array([record[0] for record in records])

In [None]:
def pipeline(X, y, model_type):
    """Train and evaluate one model type with 5-fold stratified cross-validation.

    In every fold the training part is class-balanced with
    ``balanced_subsample`` before fitting, and the fitted model is scored on
    the full (unbalanced) training and test parts with ``single_score``.

    Args:
        X: Feature matrix (array-like, samples along the first axis).
        y: Binary target vector aligned with ``X``.
        model_type: ``"mlp"`` (Keras network from ``mlp_model``), ``"rfc"``
            (random forest) or ``"gbc"`` (gradient boosting).

    Returns:
        Tuple ``(model, data)``: the model fitted on the last fold, and a dict
        mapping the 1-based fold number to its metrics
        (``tr_auc``, ``f1_score``, ``te_auc``, ``sen``, ``spec``).

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Fail early with a clear message instead of crashing later on
    # ``None.fit``.
    if model_type not in ("mlp", "rfc", "gbc"):
        raise ValueError(
            "Unknown model_type %r; expected 'mlp', 'rfc' or 'gbc'" % (model_type,)
        )

    skf = StratifiedKFold(n_splits=5)
    data = {}

    X, y = np.array(X), np.array(y)

    for count, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
        X_tr, X_te = X[train_index], X[test_index]
        y_tr, y_te = y[train_index], y[test_index]

        if model_type == "mlp":
            if len(X_tr.shape) > 2:
                input_shape = (X_tr.shape[1], X_tr.shape[-1])
            else:
                input_shape = (X_tr.shape[-1],)
            model = mlp_model(input_shape)

            # Train over 100 epochs, drawing a fresh balanced subsample
            # for each one.
            for _ in range(100):
                xs, ys = balanced_subsample(X_tr, y_tr, 1.0)
                ys = ys.reshape(-1, 1)  # one target column

                model.fit(x=xs, y=ys,
                          batch_size=32,
                          validation_split=.2,
                          epochs=1,
                          verbose=0)
        else:
            if model_type == "rfc":
                model = RF(n_estimators=450, verbose=1, n_jobs=-1)
            else:
                model = GBC(n_estimators=400, learning_rate=0.09, verbose=1)

            # The tree ensembles are fitted once on a single balanced subsample.
            xs, ys = balanced_subsample(X_tr, y_tr, 1)
            model.fit(xs, ys)

        # NOTE(review): for the scikit-learn classifiers ``predict`` returns
        # class labels rather than probabilities, so their AUC is computed
        # from hard 0/1 predictions. Kept as-is to stay comparable with the
        # reported results; consider ``predict_proba`` -- TODO confirm.
        yhat = model.predict(X_tr)
        tr_auc, _, _, _ = single_score(y_tr, yhat)

        yhat2 = model.predict(X_te)
        te_auc, f1, sen, spec = single_score(y_te, yhat2)

        data[count] = {'tr_auc': tr_auc, 'f1_score': f1, 'te_auc': te_auc, 'sen': sen, 'spec': spec}

    return model, data

## Dane po preprocessingu

- Zmienna `X` została stworzona na podstawie cech, będących szeregami czasowymi. Każda taka cecha przerobiona została na min, średnią, max oraz odchylenie standardowe.

In [5]:
X = np.load("X48.npy")
X

array([[4.22535211e-02, 0.00000000e+00, 5.43478261e-02, ...,
        4.03017024e-01, 1.33952979e-01, 4.75067826e-01],
       [2.58215962e-01, 1.18421053e-01, 3.26086957e-01, ...,
        4.10958588e-14, 0.00000000e+00, 0.00000000e+00],
       [2.11267606e-01, 4.05553814e-01, 8.69565217e-02, ...,
        2.52207581e-01, 1.88907108e-01, 2.31845699e-01],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        4.10958588e-14, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.10526316e-01, 0.00000000e+00, ...,
        2.64952543e-01, 2.88610525e-01, 4.89848547e-02],
       [3.20522201e-01, 1.57894737e-01, 0.00000000e+00, ...,
        4.83805376e-02, 3.83338758e-02, 1.06946021e-01]])

- Zmienna `y` jest wektorem 0-1: 0 oznacza, że dany pacjent przeżył, a 1 – że zmarł.

In [6]:
y = get_task()
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Odtworzenie modeli

Poniżej reprodukowane będą modele klasyczne do zadania `in-hospital mortality`.

In [None]:
from keras.models import *
from keras.layers import *
from tensorflow.keras import regularizers
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.optimizers import *
from keras.regularizers import *
from keras import metrics
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GBC

res = [] # to save results from model testing

### MLP

In [8]:
def mlp_model(input_shape, hidden = 256, targets = 1, multiclass = False, learn_rate = 1e-4):
    """Build and compile a three-hidden-layer MLP.

    Each hidden block is a ReLU ``Dense`` layer of width ``hidden`` followed
    by 50% dropout; the output is a ``Dense(targets)`` layer with a softmax
    (multiclass) or sigmoid (binary) activation.

    Args:
        input_shape: Shape of a single input sample, passed to the first layer.
        hidden: Number of units in every hidden layer.
        targets: Number of output units.
        multiclass: If true, use softmax with categorical cross-entropy;
            otherwise sigmoid with binary cross-entropy.
        learn_rate: Learning rate for the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    net = Sequential()

    # The first hidden block carries the input shape
    net.add(Dense(hidden, activation = 'relu', input_shape = input_shape))
    net.add(Dropout(.5))
    # Two more identical hidden blocks
    for _ in range(2):
        net.add(Dense(hidden, activation = 'relu'))
        net.add(Dropout(.5))

    net.add(Dense(targets))

    # Output activation, loss and reported metric depend on the task type
    if multiclass:
        out_activation = 'softmax'
        loss_name = 'categorical_crossentropy'
        metric_names = ['categorical_accuracy']
    else:
        out_activation = 'sigmoid'
        loss_name = 'binary_crossentropy'
        metric_names = ['accuracy']

    net.add(Activation(out_activation))
    net.compile(loss=loss_name, optimizer=Adam(lr=learn_rate, beta_1 =.5 ), metrics=metric_names)
    return net

In [27]:
model, data = pipeline(X, y, "mlp")

In [64]:
res.append(pd.DataFrame(data))

### Random forest

In [None]:
model_rfc, data_rfc = pipeline(X, y, "rfc")

In [65]:
res.append(pd.DataFrame(data_rfc))

### Gradient boosting

In [None]:
model_gbc, data_gbc = pipeline(X, y, "gbc")

In [66]:
res.append(pd.DataFrame(data_gbc))

## Wnioski

In [78]:
# Summary table: for every model in ``res`` (expected order: MLP, random
# forest, gradient boosting) one column with the row-wise mean and one with
# the row-wise standard deviation of its scores.
columns = ["mlp_av", "mlp_std", "rfc_av", "rfc_std", "gbc_av", "gbc_std"]

# Collect all summary columns first and concatenate once, instead of growing
# a frame (seeded with None) inside the loop.
summary_parts = []
for model_scores in res:
    summary_parts.append(model_scores.mean(axis=1))
    summary_parts.append(model_scores.std(axis=1))

results = pd.concat(summary_parts, axis=1)
results.columns = columns
results

Unnamed: 0,mlp_av,mlp_std,rfc_av,rfc_std,gbc_av,gbc_std
tr_auc,0.873796,0.003013,0.883403,0.002567,0.848945,0.003959
f1_score,0.429742,0.016478,0.424311,0.012273,0.448952,0.016758
te_auc,0.848446,0.008341,0.767847,0.011077,0.775384,0.01724
sen,0.809055,0.020538,0.806024,0.033422,0.781135,0.0483
spec,0.734357,0.024202,0.72967,0.02338,0.769634,0.025135


Wyniki uzyskane przez autorów artykułu są w **Table 1**.
<img src="results.png" width="500" />

### MLP
Zreprodukowane MLP dało AUC podobne do tego z artykułu, a pozostałe metryki (mimo że były w miarę bliskie) okazały się gorsze. Wynika to najprawdopodobniej z tego, że autorzy nie podali wszystkich parametrów swojego najlepszego modelu MLP (przy tej próbie odtworzenia użyte zostały wartości domyślne).
### Random forest
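Tutaj wszystkie wyniki były gorsze od tych z artykułu. Przyczyna jest najprawdopodobniej taka sama jak w przypadku MLP: autorzy nie podali wszystkich parametrów swojego modelu.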
### Gradient boosting
Jedyny model, który powtórzył wyniki autorów artykułu. Warto jednak zaznaczyć, że jego odpowiednik z artykułu miał najsłabsze wyniki (w naszym przypadku najgorzej wypadł `random forest`).