In [12]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import *
from sklearn.metrics import roc_curve, auc as auc_score, confusion_matrix, f1_score
from xgboost import XGBClassifier

In [13]:
def get_task(path):
    with open(path, 'rb') as f:
        labels = pickle.load(f)
    dct = {'mort':0, 'readmit': 1, 'los': 2, 'dx':3 }
    task = [yy[dct['mort']] for yy in labels]
    return np.array(task)

In [14]:
#Loading data
X = np.load("./local_mimic/save/X48.npy")
Z = np.load("./local_mimic/save/w2v.npy")
y = get_task("./local_mimic/save/y")

In [15]:
#Data transformations
X, Z, y = np.array(X), np.array(Z), np.array(y)
X = np.append(X, Z, axis=1)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [17]:
parameters_space = {
'eta': 2**np.linspace(-10, 0, 10),
'alpha': 2**np.linspace(-10, 10, 10),
'nrounds': np.floor(np.linspace(1, 5000, 10)),
'lambda': 2**np.linspace(-10, 10, 10),
'min_child_weight': 2**np.linspace(0, 7, 10),
'colsample_bytree': np.linspace(0, 1, 10),
'colsample_bylevel': np.linspace(0, 1, 10),}

In [18]:
xgb = XGBClassifier(n_jobs=-1)

In [19]:
random_cv = RandomizedSearchCV(estimator=xgb, 
                               param_distributions=parameters_space, 
                               n_iter=300, 
                               scoring=["roc_auc", "f1", "accuracy"],
                               refit="roc_auc",
                               random_state=123, 
                               verbose=10,
                               n_jobs=-1)

In [20]:
cv_res = random_cv.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits




Parameters: { "nrounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [23]:
print("Best mean AUC: ")
print(cv_res.best_score_)
cv_res_df = pd.DataFrame(cv_res.cv_results_)
cv_res_df.to_csv("CV_results_2.csv")
cv_res.best_estimator_.save_model("CV_best_model2")

Best mean AUC: 
0.9174918953294815


In [24]:
#Score on test data
print("Test DF AUC: ")
print(cv_res.score(X_test, y_test))

Test DF AUC: 
0.9279470211134784
