### Import all the required libraries

In [None]:
import json
import string
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import timeit

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from collections import Counter
from gensim.models import Word2Vec

import pickle
import re
import hashlib
import scipy.stats as st

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score, fbeta_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc
import xgboost as xgb
from tqdm import tqdm

from scipy.sparse import hstack, csr_matrix

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below.
warnings.filterwarnings("ignore")

# Download the NLTK resources; presumably required by stopwords.words and
# word_tokenize, which the (commented-out) embedding cells call.
nltk.download("stopwords")
nltk.download("punkt")

## Helper functions

In [None]:
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Args:
        obj: Any picklable object.
        name: Destination path without the ``.pkl`` suffix.
    """
    destination = name + '.pkl'
    with open(destination, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    """Return the object pickled in the file ``name + '.pkl'``.

    Args:
        name: Source path without the ``.pkl`` suffix.

    Returns:
        The unpickled object.
    """
    source = name + '.pkl'
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [None]:
def y_binary(y_pred, threshold):
    """Turn scores into 0/1 labels: 1 where ``y_pred >= threshold``.

    Args:
        y_pred: Array-like supporting ``>=`` and ``.astype`` (e.g. ndarray).
        threshold: Cut-off value; scores equal to it map to 1.

    Returns:
        Integer array of the same shape as ``y_pred``.
    """
    at_or_above = y_pred >= threshold
    return at_or_above.astype('int')


def model_performance(y_test, y_pred_bi, y_pred, f_score, value_dict):
    """Print binary-classification metrics, plot the PR curve, log results.

    NOTE: a later cell of this notebook defines another ``model_performance``
    with three parameters; once that cell has run it replaces this function.

    Args:
        y_test: True 0/1 labels.
        y_pred_bi: Predicted 0/1 labels (scores already thresholded).
        y_pred: Predicted scores, used for ROC-AUC and the PR curve.
        f_score: F-score value to record alongside the other metrics.
        value_dict: Dict with list values under the keys ``'f_score'``,
            ``'accuracy'``, ``'recall'`` and ``'precision'``; one value is
            appended to each.

    Returns:
        The same ``value_dict`` object, after appending.
    """
    # labels=[0, 1] keeps the matrix 2x2 even when only one class is present
    # in y_test / y_pred_bi; otherwise the DataFrame constructor fails.
    conf_mat = pd.DataFrame(confusion_matrix(y_test, y_pred_bi, labels=[0, 1]),
                            columns=['Pred0', 'Pred1'],
                            index=['True0', 'True1'])
    # Per-class precision: diagonal over column (predicted) totals, in %.
    precision_val = np.round(
        np.diagonal(conf_mat) / np.sum(conf_mat, axis=0) * 100, 2)
    # Per-class recall: diagonal over row (true) totals, in %.
    recall_val = np.round(
        np.diagonal(conf_mat) / np.sum(conf_mat, axis=1) * 100, 2)
    accuracy_val = np.round(
        np.sum(np.diagonal(conf_mat)) / np.sum(conf_mat.values) * 100, 2)
    auc_test = roc_auc_score(y_test, y_pred)
    # Data to plot precision - recall curve
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    # Area under the precision-recall curve
    auc_precision_recall = auc(recall, precision)

    value_dict['f_score'].append(f_score)
    value_dict['accuracy'].append(accuracy_val)
    # Positional access: the Series are labelled 'True0'/'True1' and
    # 'Pred0'/'Pred1', so an integer key must not be treated as a label.
    value_dict['recall'].append(recall_val.iloc[1])
    value_dict['precision'].append(precision_val.iloc[1])

    print('CONFUSION MATRIX: \n {}'.format(conf_mat))
    print('___________________________________')
    print('Precision: \n {}'.format(precision_val))
    print('___________________________________')
    print('Recall: \n {}'.format(recall_val))
    print('___________________________________')
    print('Accuracy: \n {}'.format(accuracy_val))
    print('ROC-AUC value: {}'.format(auc_test))
    print('PR-AUC value: {}'.format(auc_precision_recall))
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()
    return value_dict

def auc_performance(y_test, y_pred, value_dict):
    """Append ROC-AUC and PR-AUC of the predictions to ``value_dict``.

    Args:
        y_test: True binary labels.
        y_pred: Predicted scores.
        value_dict: Dict with list values under ``'roc-auc'`` and
            ``'pr-auc'``.

    Returns:
        The same ``value_dict`` object, after appending.
    """
    roc_area = roc_auc_score(y_test, y_pred)
    # The thresholds returned by the curve are not needed here.
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_pred)
    pr_area = auc(pr_recall, pr_precision)

    value_dict['roc-auc'].append(roc_area)
    value_dict['pr-auc'].append(pr_area)
    return value_dict

In [None]:
def import_data(file):
    """Read a CSV file into a DataFrame and shrink its numeric dtypes.

    Args:
        file: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame after ``reduce_mem_usage`` has downcast its columns.
    """
    # ``keep_date_col`` was removed from the call: it only matters when
    # ``parse_dates`` combines several columns, which ``parse_dates=True``
    # does not do, and recent pandas releases deprecate the keyword.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)


def get_sha256_hash(x):
    """Return the upper-case hex SHA-256 digest of ``str(x)`` (UTF-8)."""
    payload = str(x).encode('utf-8')
    return hashlib.sha256(payload).hexdigest().upper()


def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Signed integers, unsigned integers and floats are each moved to the
    smallest dtype of the same kind whose range contains the column's min and
    max. A column is never widened. Boolean, object, datetime, timedelta and
    pandas extension dtypes are left untouched.

    Args:
        df: DataFrame to optimise; its columns are reassigned in place.
        use_float16: When True (default, the historical behaviour) floats
            may be stored as float16. float16 keeps only about three
            significant decimal digits, so pass False for columns whose
            precision matters.

    Returns:
        The same DataFrame object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per numpy kind, smallest first.
    float_candidates = (np.float32,)
    if use_float16:
        float_candidates = (np.float16,) + float_candidates
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32)),
        'f': (np.finfo, float_candidates),
    }

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable ints, strings, ...) are not
        # numpy dtypes and are skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in candidates_by_kind:
            continue

        limits_of, candidates = candidates_by_kind[kind]
        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Already at least this compact; never widen a column.
                break
            limits = limits_of(candidate)
            # Comparisons are False when min/max are NaN (empty or all-NaN
            # column), so such columns keep their dtype.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


def y_binary(y_pred, threshold):
    """Binarise scores: 1 where the score reaches ``threshold``, else 0.

    Same behaviour as the ``y_binary`` defined earlier in this notebook.
    """
    reached = y_pred >= threshold
    labels = reached.astype('int')
    return labels


def model_performance(y_test, y_pred_bi, y_pred):
    """Print binary-classification metrics and plot the PR curve.

    This three-parameter definition replaces the five-parameter
    ``model_performance`` defined earlier in this notebook.

    Args:
        y_test: True 0/1 labels.
        y_pred_bi: Predicted 0/1 labels (scores already thresholded).
        y_pred: Predicted scores, used for ROC-AUC and the PR curve.
    """
    # labels=[0, 1] keeps the matrix 2x2 even when only one class is present;
    # otherwise building the labelled DataFrame fails.
    conf_mat = pd.DataFrame(confusion_matrix(y_test, y_pred_bi, labels=[0, 1]),
                            columns=['Pred0', 'Pred1'],
                            index=['True0', 'True1'])
    # Per-class precision: diagonal over column (predicted) totals, in %.
    precision_val = np.round(
        np.diagonal(conf_mat) / np.sum(conf_mat, axis=0) * 100, 2)
    # Per-class recall: diagonal over row (true) totals, in %.
    recall_val = np.round(
        np.diagonal(conf_mat) / np.sum(conf_mat, axis=1) * 100, 2)
    accuracy_val = np.round(
        np.sum(np.diagonal(conf_mat)) / np.sum(conf_mat.values) * 100, 2)
    auc_test = roc_auc_score(y_test, y_pred)
    # Data to plot precision - recall curve
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    # Area under the precision-recall curve
    auc_precision_recall = auc(recall, precision)

    print('CONFUSION MATRIX: \n {}'.format(conf_mat))
    print('___________________________________')
    print('Precision: \n {}'.format(precision_val))
    print('___________________________________')
    print('Recall: \n {}'.format(recall_val))
    print('___________________________________')
    print('Accuracy: \n {}'.format(accuracy_val))
    print('ROC-AUC value: {}'.format(auc_test))
    print('PR-AUC value: {}'.format(auc_precision_recall))
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()
    
def model_performance_dictionary(y_test, y_pred_bi, y_pred, f_score, value_dict):
    """Compute binary-classification metrics and append them to a dict.

    Args:
        y_test: True 0/1 labels.
        y_pred_bi: Predicted 0/1 labels (scores already thresholded).
        y_pred: Predicted scores.
        f_score: F-score value to record alongside the other metrics.
        value_dict: Dict with list values under ``'f_score'``,
            ``'accuracy'``, ``'recall'``, ``'precision'``,
            ``'specificity'`` and ``'NPV'``.

    Returns:
        Tuple ``(value_dict, conf_mat)``: the updated dict and the 2x2
        confusion matrix (rows = true class, columns = predicted class).
    """
    # labels=[0, 1] keeps the matrix 2x2 even when only one class is present;
    # otherwise building the labelled DataFrame fails.
    conf_mat = pd.DataFrame(confusion_matrix(y_test, y_pred_bi, labels=[0, 1]),
                            columns=['Pred0', 'Pred1'],
                            index=['True0', 'True1'])

    # Per-class precision / recall in percent.
    precision_val = np.round(
        np.diagonal(conf_mat) / np.sum(conf_mat, axis=0) * 100, 2)
    recall_val = np.round(
        np.diagonal(conf_mat) / np.sum(conf_mat, axis=1) * 100, 2)
    accuracy_val = np.round(
        np.sum(np.diagonal(conf_mat)) / np.sum(conf_mat.values) * 100, 2)
    # Specificity = TN / (TN + FP); NPV = TN / (TN + FN).
    specificity = np.round(conf_mat.iloc[0,0]/(conf_mat.iloc[0,1]+conf_mat.iloc[0,0])*100,2)
    npv = np.round(conf_mat.iloc[0,0]/(conf_mat.iloc[1,0]+conf_mat.iloc[0,0])*100,2)
    # Evaluated as in the original; the values are not stored, but
    # roc_auc_score still raises if y_test holds a single class.
    auc_test = roc_auc_score(y_test, y_pred)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
    auc_precision_recall = auc(recall, precision)

    value_dict['f_score'].append(f_score)
    value_dict['accuracy'].append(accuracy_val)
    # Positional access: the Series are labelled 'True0'/'True1' and
    # 'Pred0'/'Pred1', so an integer key must not be treated as a label.
    value_dict['recall'].append(recall_val.iloc[1])
    value_dict['precision'].append(precision_val.iloc[1])

    value_dict['specificity'].append(specificity)
    value_dict['NPV'].append(npv)

    return value_dict, conf_mat

In [None]:
def clean_text(text, tokenizer, stopwords):
    """Normalise a text and split it into filtered tokens.

    Args:
        text: Value to clean; converted with ``str()`` first.
        tokenizer: Callable mapping the cleaned string to tokens.
        stopwords: Container of tokens to discard.

    Returns:
        List of tokens that are not stopwords, not purely digits and longer
        than one character.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r"\[(.*?)\]", ""),  # Remove [+XYZ chars] in content
        (r"\s+", " "),  # Collapse runs of whitespace
        (r"\w+…|…", ""),  # Remove ellipsis (and the word it truncates)
        (r"(?<=\w)-(?=\w)", " "),  # Replace dash between words
        (f"[{re.escape(string.punctuation)}]", ""),  # Remove punctuation
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return [
        tok for tok in tokenizer(cleaned)
        if tok not in stopwords and not tok.isdigit() and len(tok) > 1
    ]


def vectorize(list_of_docs, model):
    """Average the word vectors of each tokenised document.

    Args:
        list_of_docs: Iterable of token sequences, one per document.
        model: Object exposing ``vector_size`` and a ``wv`` mapping from
            token to vector (e.g. a trained Gensim Word2Vec model).

    Returns:
        List with one vector per document: the mean of the vectors of its
        known tokens, or a zero vector when no token is known.
    """
    doc_vectors = []

    for doc_tokens in list_of_docs:
        fallback = np.zeros(model.vector_size)
        known = []
        for word in doc_tokens:
            if word not in model.wv:
                continue
            try:
                known.append(model.wv[word])
            except KeyError:
                pass

        if not known:
            doc_vectors.append(fallback)
            continue
        doc_vectors.append(np.asarray(known).mean(axis=0))

    return doc_vectors

### Data loading

In [None]:
# Pickled training frame; load_obj appends the ".pkl" suffix itself.
main_new_dataset = '../data/train_final_5f_all_labs_ext_med_hash'

# Column meant to be the model's target variable.
# NOTE(review): ``target`` is not referenced in the visible cells; they
# hard-code 'INPT_DEATH_YN' instead.
target = 'INPT_DEATH_YN'
#target = 'AKIN_EVENT'

df = load_obj(main_new_dataset)

# Use the notebook's own names for the case and patient identifiers.
column_aliases = {"OR_CASE_ID": "case_id", "PAT_ID": "patientid"}
df = df.rename(columns=column_aliases)

# Replace the raw case id by its SHA-256 digest.
df["case_id"] = df["case_id"].apply(get_sha256_hash)
# Keep only the first occurrence of each column name.
first_occurrence = ~df.columns.duplicated()
df = df.loc[:, first_occurrence]

In [None]:
# Downcast numeric columns to smaller dtypes; prints memory before/after.
df = reduce_mem_usage(df)

In [None]:
# Load the stored feature list (JSON content despite the .txt extension).
# The ``with`` block closes the file once it has been parsed; ``f`` stays
# bound to the closed file object afterwards.
with open('feature_importance_final.txt') as f:
    boruta_features = json.load(f)

In [None]:
# Display the names of the columns from position 838 onwards.
df.iloc[:,838:].columns.tolist()

### Data cleaning

In [None]:
# Map the one non-numeric provider id to 1032 so the column casts to int.
provider_ids = df['PRIM_SURG_PROV_ID'].replace('E1032', 1032)
df['PRIM_SURG_PROV_ID'] = provider_ids.astype(int)

# Binary AKI flag: 1 when AKI_AKIN_CLASS exceeds the threshold, else 0
# (missing values compare False and therefore map to 0).
AKIN_THRESHOLD = 0
df['AKIN_EVENT'] = (df['AKI_AKIN_CLASS'] > AKIN_THRESHOLD).astype('int64')

In [None]:
# Relative frequency of each value of the two candidate target columns.
print(df.INPT_DEATH_YN.value_counts(normalize=True))

print(df.AKIN_EVENT.value_counts(normalize=True))

In [None]:
# Keep rows with 18 <= AGE_LT_90 < 90, a non-missing
# ADMSN_SURGERY_NUMBER_W_ANES and an ASA_STATUS other than 6.0.
# NOTE(review): ``newdf`` is not referenced again in the visible cells; the
# model frames below are all built from ``df``. Confirm that is intended.
newdf = df.loc[
    (df["AGE_LT_90"] >= 18)
    & (df["AGE_LT_90"] < 90)
    & df['ADMSN_SURGERY_NUMBER_W_ANES'].notna()
    & (df['ASA_STATUS'] != 6.0)
]

## Feature selection for different models

### Model 1 

In [None]:
# Model 1 inputs: columns at positions 0-57 plus the column at position 964
# (presumably the AKIN_EVENT column added during data cleaning; TODO confirm).
df_model = df.iloc[:, np.r_[0:58,964]].copy()

In [None]:
# Columns to EXCLUDE from the model inputs, despite the names: the data
# preparation cells drop ``num_feat`` from the numeric-dtype columns and
# ``cat_feat`` from the object/category columns of df_model.
num_feat = [
    'encounter_id', 'patientid', 'ADMSN_ID', 'ASA_STATUS', 'CASE_START',
    'CASE_END', 'LAST_EF_RESULT_DATE', 'TOT_RBC', 'CRYSTALLOID_ML',
    'COLLOID_ML', 'FLOOR_2_ICU_YN', 'POSTOP_AKI_AKIN_CLASS', 'AKI_AKIN_CLASS',
    'AKIN_EVENT', 'INPT_DEATH_YN', 'PRIM_SURG_PROV_MINUTES', 'OPEN_ACCESS_YN',
    'GYN_ONC_ERAS_YN','HCUP_CODE'
]

cat_feat = [
    'case_id', 'SEX', 'LAST_EF', 'HCUP_DESC', 'PRIMARY_CPT', 'CPT_DESC',
    'DATE_OF_SERVICE', 'CASE_SRV_NAME', 'CASE_SRV_NAME_GROUP','GROUP_AGE'
]

### Model 2 

In [None]:
# Service date (used later for the train/test split) and the outcome columns.
# NOTE(review): 'patientid' is not selected here, although the split cells
# index df_test['patientid']; confirm before running this configuration.
df_base = df[['DATE_OF_SERVICE', 'AKI_AKIN_CLASS', 'INPT_DEATH_YN']].copy()

In [None]:
# Columns at positions 58-834 plus position 964; presumably the lab
# features, going by the variable name (TODO confirm).
df_labs = df.iloc[:, np.r_[58:835,964]].copy()


In [None]:
# Model 2 inputs: base columns next to the lab columns (column-wise concat).
df_model = pd.concat([df_base, df_labs], axis=1)

In [None]:
# Columns excluded from the model inputs (dropped from the numeric and
# categorical column lists in the data-preparation cells).
num_feat = ['AKI_AKIN_CLASS', 'AKIN_EVENT', 'INPT_DEATH_YN']

cat_feat = ['DATE_OF_SERVICE']

### Model 3

In [None]:
# Service date (used later for the train/test split) and the outcome columns.
df_base = df[['DATE_OF_SERVICE', 'AKI_AKIN_CLASS', 'INPT_DEATH_YN']].copy()

In [None]:
# Column at position 835 (referred to as PROC_NAME in the cells below) plus
# the column at position 964.
df_proc_name = df.iloc[:, np.r_[835:836,964]].copy()

In [None]:
# Model 3 inputs: base columns next to the procedure-name column.
df_model = pd.concat([df_base, df_proc_name], axis=1)

In [None]:
# Columns excluded from the model inputs. PROC_NAME is excluded from the
# one-hot encoded categoricals; the commented-out cells below turn it into
# word-embedding features instead.
num_feat = ['AKI_AKIN_CLASS', 'AKIN_EVENT', 'INPT_DEATH_YN']

cat_feat = ['DATE_OF_SERVICE', 'PROC_NAME']

In [None]:
# Number of distinct PROC_NAME values in the model frame.
len(df_model.PROC_NAME.unique())

### Model 4

In [None]:
# Service date (used later for the train/test split) and the outcome columns.
df_base = df[['DATE_OF_SERVICE', 'AKI_AKIN_CLASS', 'INPT_DEATH_YN']].copy()

In [None]:
# Columns at positions 838-964; presumably the medication features, going by
# the variable name (TODO confirm).
df_med = df.iloc[:, 838:965].copy()

In [None]:
# Model 4 inputs: base columns next to the df_med columns.
df_model = pd.concat([df_base, df_med], axis=1)

In [None]:
# Columns excluded from the model inputs (dropped from the numeric and
# categorical column lists in the data-preparation cells).
num_feat = ['AKI_AKIN_CLASS', 'AKIN_EVENT', 'INPT_DEATH_YN']

cat_feat = ['DATE_OF_SERVICE']

### Model 5

In [None]:
# Model 5 inputs: columns at positions 0-57, 576-833 and 964.
df_model = df.iloc[:, np.r_[0:58,576:834,964]].copy()

# Columns excluded from the model inputs (dropped from the numeric and
# categorical column lists in the data-preparation cells).
num_feat = [
    'encounter_id', 'patientid','ADMSN_ID', 'ASA_STATUS', 'CASE_START', 'CASE_END',
    'LAST_EF_RESULT_DATE', 'TOT_RBC', 'CRYSTALLOID_ML', 'COLLOID_ML',
    'FLOOR_2_ICU_YN', 'POSTOP_AKI_AKIN_CLASS', 'AKI_AKIN_CLASS', 'AKIN_EVENT',
    'INPT_DEATH_YN', 'PRIM_SURG_PROV_MINUTES', 'OPEN_ACCESS_YN',
    'GYN_ONC_ERAS_YN','HCUP_CODE'
]

cat_feat = [
    'case_id', 'SEX', 'LAST_EF', 'HCUP_DESC', 'PRIMARY_CPT', 'CPT_DESC',
    'DATE_OF_SERVICE', 'CASE_SRV_NAME', 'GROUP_AGE'
]

### Model 6 

In [None]:
# Model 6 inputs: columns at positions 0-57, 835-836 and 964.
df_model = df.iloc[:, np.r_[0:58, 835:837,964]].copy()

# Columns excluded from the model inputs (dropped from the numeric and
# categorical column lists in the data-preparation cells).
num_feat = [
    'encounter_id', 'patientid','ADMSN_ID', 'ASA_STATUS', 'CASE_START', 'CASE_END',
    'LAST_EF_RESULT_DATE', 'TOT_RBC', 'CRYSTALLOID_ML', 'COLLOID_ML',
    'FLOOR_2_ICU_YN', 'POSTOP_AKI_AKIN_CLASS', 'AKI_AKIN_CLASS', 'AKIN_EVENT',
    'INPT_DEATH_YN', 'PRIM_SURG_PROV_MINUTES', 'OPEN_ACCESS_YN',
    'GYN_ONC_ERAS_YN'
]

cat_feat = [
    'case_id', 'SEX', 'LAST_EF', 'HCUP_DESC', 'PRIMARY_CPT', 'CPT_DESC',
    'DATE_OF_SERVICE', 'CASE_SRV_NAME', 'GROUP_AGE', 'PROC_NAME'
]

### Model 7

In [None]:
# Model 7 inputs: columns at positions 0-57 and 837-964.
df_model = df.iloc[:, np.r_[0:58, 837:965]].copy()

In [None]:
# Columns excluded from the model inputs (dropped from the numeric and
# categorical column lists in the data-preparation cells).
num_feat = [
    'encounter_id','patientid', 'ADMSN_ID', 'ASA_STATUS', 'CASE_START', 'CASE_END',
    'LAST_EF_RESULT_DATE', 'TOT_RBC', 'CRYSTALLOID_ML', 'COLLOID_ML',
    'FLOOR_2_ICU_YN', 'POSTOP_AKI_AKIN_CLASS', 'AKI_AKIN_CLASS', 'AKIN_EVENT',
    'INPT_DEATH_YN', 'PRIM_SURG_PROV_MINUTES', 'OPEN_ACCESS_YN',
    'GYN_ONC_ERAS_YN'
]

cat_feat = [
    'case_id', 'SEX', 'LAST_EF', 'HCUP_DESC', 'PRIMARY_CPT', 'CPT_DESC',
    'DATE_OF_SERVICE', 'CASE_SRV_NAME', 'GROUP_AGE'
]

### Model 8&9

In [None]:
# Models 8 and 9 start from every column of df.
df_model = df.copy()

In [None]:
# Columns excluded from the model inputs (dropped from the numeric and
# categorical column lists in the data-preparation cells).
# NOTE(review): 'or_case_id' is lower-case here, while the loading cell
# renames 'OR_CASE_ID' to 'case_id'; confirm such a column exists.
num_feat = [
    'encounter_id', 'patientid','ADMSN_ID', 'ASA_STATUS', 'CASE_START', 'CASE_END',
    'LAST_EF_RESULT_DATE', 'TOT_RBC', 'CRYSTALLOID_ML', 'COLLOID_ML',
    'FLOOR_2_ICU_YN', 'POSTOP_AKI_AKIN_CLASS', 'AKI_AKIN_CLASS', 'AKIN_EVENT',
    'INPT_DEATH_YN', 'PRIM_SURG_PROV_MINUTES', 'OPEN_ACCESS_YN',
    'GYN_ONC_ERAS_YN', 'or_case_id'
]

cat_feat = [
    'case_id', 'SEX', 'LAST_EF', 'HCUP_DESC', 'PRIMARY_CPT', 'CPT_DESC',
    'DATE_OF_SERVICE', 'CASE_SRV_NAME', 'GROUP_AGE', 'PROC_NAME'
]

In [None]:
# Latest service date in the test split.
# NOTE(review): df_test is first assigned in the train/test split cell
# further down, so this cell only works after that cell has been run.
df_test.DATE_OF_SERVICE.max()

### Split data into train and test

In [None]:
# Temporal split: cases before 2019-01-01 train the model, cases on or after
# that date form the test set.
df_train = df_model.loc[df_model['DATE_OF_SERVICE'] < '2019-01-01']
df_test = df_model.loc[df_model['DATE_OF_SERVICE'] >= '2019-01-01']

In [None]:
# Distinct patient ids that occur in the test split.
patientid_test = set(df_test['patientid'])

In [None]:
# Remove from training every row of a patient who also appears in the test
# split, so no patient contributes to both sets.
df_train = df_train[~df_train['patientid'].isin(patientid_test)]

In [None]:
# Drop the rows whose ADMSN_ID is missing. The previous form,
# ``df['ADMSN_ID'].dropna(inplace=True)``, only modified the temporary column
# Series and never removed rows from the frames, so the filtered frame has to
# be reassigned.
df_train = df_train.dropna(subset=['ADMSN_ID'])
df_test = df_test.dropna(subset=['ADMSN_ID'])

In [None]:
# Build a list of rounded 95% t-intervals, one per column of the describe()
# summary table.
# NOTE(review): ``df_mean``, ``inf_val`` and ``inf_val_neg`` are not defined
# anywhere in the visible notebook, so this cell raises NameError as written.
# NOTE(review): the interval is computed over the rows of the describe()
# table (summary statistics), not over the raw column values; confirm that
# is intended. ``ci`` is not used by later visible cells.
df_describe = df_model.describe()
ci = []
for i in df_describe:
    # Filter rows whose value differs from both infinity placeholders.
    tmp = df_describe[(df_mean[i] != inf_val)&(df_describe[i] != inf_val_neg)]
    # t-interval around the NaN-ignoring mean, scaled by the standard error.
    interval = st.t.interval(0.95, len(tmp[i])-1, 
                  loc=np.nanmean(tmp[i]), scale=st.sem(tmp[i],nan_policy='omit'))
    round_ci = [round(num,3) for num in list(interval)]
    ci.append(round_ci)

### Data preparation

In [None]:
# Numeric model inputs: every numeric-dtype column of df_model except the
# identifier/outcome columns listed in num_feat.
numeric_features = (
    df_model
    .select_dtypes(include='number')
    .drop(columns=num_feat)
    .columns
)

In [None]:
# Categorical model inputs: every object/category column of df_model except
# the columns listed in cat_feat.
cat_features = (
    df_model
    .select_dtypes(include=['object', 'category'])
    .drop(columns=cat_feat)
    .columns
)

In [None]:
# X_train is another name for df_train (no copy is made); the feature columns
# are selected from it later via numeric_features / cat_features.
X_train = df_train
# Training labels; switch to the commented line to model AKIN_EVENT instead.
y_train = df_train['INPT_DEATH_YN'].values
#y_train = df_train['AKIN_EVENT'].values

In [None]:
start_time = timeit.default_timer()

# Fit the one-hot encoder on the training categoricals. Dense (non-sparse)
# output is requested because the encoded block is later stacked with
# np.hstack. Newer scikit-learn releases name the keyword ``sparse_output``;
# older ones only accept ``sparse``.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoder.fit(X_train[cat_features])

elapsed = timeit.default_timer() - start_time

print('Processing time', elapsed, 'seconds')

# Encode the training categoricals. The timer is not restarted, so the second
# figure is the cumulative time for fit + transform.
X_train_sparse = encoder.transform(X_train[cat_features])
elapsed = timeit.default_timer() - start_time

print('Processing time', elapsed, 'seconds')

In [None]:
# Labelled view of the one-hot block. ``get_feature_names_out`` is used when
# the installed scikit-learn provides it, with the older
# ``get_feature_names`` as fallback.
X_sparse = pd.DataFrame(
    X_train_sparse,
    columns=(getattr(encoder, 'get_feature_names_out', None)
             or encoder.get_feature_names)(cat_features))

In [None]:
# # #Uncomment for word embeddings (Models 3,6,8,9)
# custom_stopwords = set(
#     stopwords.words("english") + ["than", "to", "and", "or", "of"])

# data = X_train.copy()

# data["tokens"] = data["PROC_NAME"].map(
#     lambda x: clean_text(x, word_tokenize, custom_stopwords))

# # Remove empty values and keep relevant columns
# data = data.loc[data.tokens.map(lambda x: len(x) > 0), ["PROC_NAME", "tokens"]]

# print('Original dataframe: {}'.format(X_train.shape))
# print('Pre-processed dataframe: {}'.format(data.shape))

In [None]:
# # Uncomment for word embeddings (Models 3,6,8,9)
# docs = data["PROC_NAME"].values
# tokenized_docs = data['tokens'].values
# vocab = Counter()
# for token in tokenized_docs:
#     vocab.update(token)

In [None]:
# # Uncomment for word embeddings (Models 3,6,8,9)
# # #model_word2vec = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=5, seed=42)
# model_word2vec = load_obj('../word2vec_model3_new_100')
# vectorized_docs = vectorize(tokenized_docs, model=model_word2vec)

# X_train = X_train.join(
#     pd.DataFrame(vectorized_docs,
#                  columns=['proccode_{}'.format(col) for col in range(0, 100)]))

# proc_name = ['proccode_{}'.format(col) for col in range(0, 100)]

In [None]:
# Stack the numeric columns and the one-hot block side by side.
X_train_enc = np.hstack((X_train[numeric_features], X_train_sparse))
# Uncomment the next line for word embeddings (Models 3,6,8,9)
# X_train_enc = np.hstack((X_train_enc, X_train[proc_name]))

In [None]:
# Column names of the encoded training matrix, in hstack order: numeric
# columns first, then the one-hot columns. ``get_feature_names_out`` is used
# when the installed scikit-learn provides it, with the older
# ``get_feature_names`` as fallback.
# For word embeddings (Models 3,6,8,9) re-enable the trailing list(proc_name).
feature_names = list(numeric_features.astype(str)) + list(
    (getattr(encoder, 'get_feature_names_out', None)
     or encoder.get_feature_names)(cat_features)) #+ list(proc_name)

In [None]:
# Wrap the stacked array in a DataFrame labelled with feature_names.
X_train_enc = pd.DataFrame(X_train_enc, columns=feature_names)

In [None]:
# Uncomment for feature selection (Model 9)
# feature_names = boruta_features

In [None]:
# XGBoost does not accept '[', ']' or '<' in feature names, so each is
# replaced by '_'.
regex = re.compile(r"\[|\]|<")

# Rename the frame's columns first, then select with the sanitised names.
# Selecting with sanitised names while the columns still carried the raw
# names raised KeyError as soon as one name had been changed. This order also
# works when feature_names is a subset (the Model 9 feature selection).
X_train_enc.columns = [regex.sub("_", str(col)) for col in X_train_enc.columns]
feature_names = [regex.sub("_", str(col)) for col in feature_names]
X_train_enc = X_train_enc[feature_names]

### Model training

In [None]:
# Training DMatrix built from a CSR-sparse copy of the encoded frame, with
# the sanitised feature names and the training labels attached.
dtrain = xgb.DMatrix(data=csr_matrix(X_train_enc),
                     feature_names=feature_names,
                     label=y_train,
                     nthread=-1)

In [None]:
# XGBoost parameters passed to xgb.train below: a gradient-boosted tree
# binary classifier evaluated with the PR-AUC metric.
# NOTE(review): the name suggests these come from an earlier tuning run that
# is not part of the visible notebook.
best_param = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'aucpr',
    'seed': 1,
    'verbosity': 1,
    'max_depth': 12,
    'min_child_weight': 5,
    'eta': 0.1,
    'gamma': 0.7,
    'subsample': 0.8,
    'colsample_bytree': 1,
    'nthread': 50
}

In [None]:
# Train the booster for 100 boosting rounds on the training DMatrix.
model = xgb.train(params=best_param, dtrain=dtrain, num_boost_round=100)

In [None]:
# Work on an explicit copy. The previous chained
# ``X_test[col].replace(..., inplace=True)`` acted on a temporary Series of a
# sliced frame, which pandas does not guarantee to write back (and does not
# under copy-on-write).
X_test = df_test.copy()
# Recode category values before encoding; presumably these values were not
# seen when the encoder was fitted on the training data (TODO confirm).
X_test['GENDER'] = X_test['GENDER'].replace('U', 'M')
X_test['ANES_TYPE_HANDOFF'] = X_test['ANES_TYPE_HANDOFF'].replace(
    'L&amp;D NITROUS', np.nan)
y_test = df_test['INPT_DEATH_YN'].values
#y_test = df_test['AKIN_EVENT'].values

X_test_sparse = encoder.transform(X_test[cat_features])

In [None]:
# Pickle the trained booster to 'model/xgboost_model1_mort.pkl' (save_obj
# appends the '.pkl' suffix).
save_obj(model,'model/xgboost_model1_mort')

In [None]:
# Same preparation as the earlier test-set cell, repeated in this notebook.
# An explicit copy is used because the previous chained
# ``X_test[col].replace(..., inplace=True)`` acted on a temporary Series of a
# sliced frame, which pandas does not guarantee to write back.
X_test = df_test.copy()
# Recode category values before encoding; presumably these values were not
# seen when the encoder was fitted on the training data (TODO confirm).
X_test['GENDER'] = X_test['GENDER'].replace('U', 'M')
X_test['ANES_TYPE_HANDOFF'] = X_test['ANES_TYPE_HANDOFF'].replace(
    'L&amp;D NITROUS', np.nan)
y_test = df_test['INPT_DEATH_YN'].values
#y_test = df_test['AKIN_EVENT'].values

X_test_sparse = encoder.transform(X_test[cat_features])

In [None]:
# Stack the numeric test columns and the one-hot block side by side, in the
# same order as for the training matrix.
X_test_enc = np.hstack((X_test[numeric_features], X_test_sparse))

In [None]:
# # # Uncomment for word embeddings (Models 3,6,8,9)
# data = X_test.copy()

# data["tokens"] = data["PROC_NAME"].map(
#     lambda x: clean_text(x, word_tokenize, custom_stopwords))

# # Remove empty values and keep relevant columns
# data = data.loc[data.tokens.map(lambda x: len(x) > 0), ["PROC_NAME", "tokens"]]

# docs = data["PROC_NAME"].values
# tokenized_docs = data['tokens'].values

In [None]:
# # Uncomment for word embeddings (Models 3,6,8,9)
# vectorized_docs_test = vectorize(tokenized_docs, model=model_word2vec)
# X_test = X_test.join(
#     pd.DataFrame(vectorized_docs_test,
#                  columns=['proccode_{}'.format(col) for col in range(0, 100)]))

In [None]:
# Column names of the encoded test matrix: numeric columns first, then the
# one-hot columns. ``get_feature_names_out`` is used when the installed
# scikit-learn provides it, with the older ``get_feature_names`` as fallback.
# For word embeddings (Models 3,6,8,9) re-enable list(proc_name) and the
# commented hstack line below.
feature_names = list(numeric_features.astype(str)) + \
    list((getattr(encoder, 'get_feature_names_out', None)
          or encoder.get_feature_names)(cat_features))#+list(proc_name)
# Apply the same '[', ']', '<' -> '_' replacement that was applied to the
# training feature names, so the test names match those the model was
# trained with.
feature_names = [re.sub(r"\[|\]|<", "_", name) for name in feature_names]
#X_test_enc = np.hstack((X_test_enc, X_test[proc_name]))
X_test_enc = pd.DataFrame(X_test_enc, columns=feature_names)

In [None]:
# Uncomment for feature selection (Model 9) + comment 2nd dtest initialization
#feature_names= boruta_features
#dtest = xgb.DMatrix(data=csr_matrix(X_test_enc[feature_names]),feature_names = feature_names)

In [None]:
# Test DMatrix (no labels attached) and the model's predicted scores for it.
dtest = xgb.DMatrix(data=csr_matrix(X_test_enc),
                     feature_names=feature_names)
y_pred = model.predict(dtest)

### Model Scoring

In [None]:
# One result dictionary per threshold-selection criterion:
# F1, F-beta (beta=2.5) and F-beta (beta=3.0).
results_f1 = {'f_score':[],'accuracy':[],'recall':[],'precision':[],'specificity':[],'NPV':[]}
results_f2 = {'f_score':[],'accuracy':[],'recall':[],'precision':[],'specificity':[],'NPV':[]}
results_f3 = {'f_score':[],'accuracy':[],'recall':[],'precision':[],'specificity':[],'NPV':[]}
n =1000
trials = 50
# cache for performance
unique_pat_ids = X_test["patientid"].unique()
auc_dict = {'roc-auc':[],
            'pr-auc':[]}

# Loop invariants, computed once: the candidate thresholds and the encoded
# column names (sanitised like the training names; uses
# ``get_feature_names_out`` when scikit-learn provides it).
thresholds = np.arange(0, 1, 0.01)
encoded_names = [
    re.sub(r"\[|\]|<", "_", name)
    for name in list(numeric_features.astype(str)) + list(
        (getattr(encoder, 'get_feature_names_out', None)
         or encoder.get_feature_names)(cat_features))  # + list(proc_name)
]

for trial in tqdm(range(trials)):
    # Draw n patient ids (np.random.choice samples with replacement by
    # default) and keep every test row belonging to a drawn patient. The draw
    # is not seeded, so results vary between runs.
    #tmp = np.random.choice(X_test.index,size=n)
    #sample = X_test[X_test.index.isin(tmp)].copy()#X_test.iloc[tmp,:].copy()
    tmp = np.random.choice(unique_pat_ids,size=n)
    sample = X_test.loc[X_test["patientid"].isin(tmp)].copy()
    y_test = sample['INPT_DEATH_YN'].values
    #y_test = sample['AKIN_EVENT'].values

#     # Uncomment for word embeddings (Models 3,6,8,9)
#     data = sample.copy()
#
#     data["tokens"] = data["PROC_NAME"].map(
#         lambda x: clean_text(x, word_tokenize, custom_stopwords))

#     # Remove empty values and keep relevant columns
#     data = data.loc[data.tokens.map(lambda x: len(x) > 0), ["PROC_NAME", "tokens"]]

#     docs = data["PROC_NAME"].values
#     tokenized_docs = data['tokens'].values
#     vectorized_docs_test = vectorize(tokenized_docs, model=model_word2vec)
#     sample = sample.join(
#         pd.DataFrame(vectorized_docs_test,
#                      columns=['proccode_{}'.format(col) for col in range(0, 100)]))

    X_test_sparse = encoder.transform(sample[cat_features])
    X_test_enc = np.hstack((sample[numeric_features], X_test_sparse))

    feature_names = encoded_names
    #X_test_enc = np.hstack((X_test_enc, sample[proc_name]))
    X_test_enc = pd.DataFrame(X_test_enc, columns=feature_names)

    #feature_names= boruta_features
    #dtest = xgb.DMatrix(data=csr_matrix(X_test_enc[feature_names]),feature_names = feature_names)

    dtest = xgb.DMatrix(data=csr_matrix(X_test_enc),
                        feature_names=feature_names)
    # ``model`` is the booster trained above; the previous ``model6`` is not
    # defined anywhere in the visible notebook.
    y_pred = model.predict(dtest)

    # Score every candidate threshold under the three criteria.
    scores = [f1_score(y_test, y_binary(y_pred, t)) for t in thresholds]
    scores_beta_1 = [
        fbeta_score(y_test, y_binary(y_pred, t), beta=2.5)
        for t in thresholds
    ]
    scores_beta_2 = [
        fbeta_score(y_test, y_binary(y_pred, t), beta=3.0)
        for t in thresholds
    ]

    ix_f1 = np.argmax(scores)
    ix_fbeta_1 = np.argmax(scores_beta_1)
    ix_fbeta_2 = np.argmax(scores_beta_2)

    # model_performance_dictionary is the helper that takes five arguments,
    # fills the 'specificity'/'NPV' keys and returns (dict, confusion matrix);
    # the active ``model_performance`` takes three arguments and returns None.
    # Model performance with the optimal threshold for F1
    y_pred_binary_f1 = y_binary(y_pred, thresholds[ix_f1])
    results_f1, cm1 = model_performance_dictionary(
        y_test, y_pred_binary_f1, y_pred, scores[ix_f1], results_f1)
    # Model performance with the optimal threshold for Fbeta (beta=2.5)
    y_pred_binary_f2 = y_binary(y_pred, thresholds[ix_fbeta_1])
    results_f2, cm2 = model_performance_dictionary(
        y_test, y_pred_binary_f2, y_pred, scores_beta_1[ix_fbeta_1], results_f2)
    # Model performance with the optimal threshold for Fbeta (beta=3.0)
    y_pred_binary_f3 = y_binary(y_pred, thresholds[ix_fbeta_2])
    results_f3, cm3 = model_performance_dictionary(
        y_test, y_pred_binary_f3, y_pred, scores_beta_2[ix_fbeta_2], results_f3)
    #auc_dict = auc_performance(y_test,y_pred, auc_dict)

In [None]:
# SHAP feature-importance bar plot (top 15 features) for the trained model.
# NOTE(review): ``shap`` is not imported in the visible import cell, so this
# cell raises NameError unless it is imported elsewhere.
# NOTE(review): despite its name, X_test_enc_1 is built from the TRAINING
# matrix X_train_enc, and ``feature_names`` holds whatever the last executed
# cell assigned; confirm both are intended.
X_test_enc_1 = pd.DataFrame(X_train_enc, columns=feature_names)
explainer = shap.Explainer(model)
shap_values = explainer.shap_values(X_train_enc)
shap.summary_plot(shap_values,
                  X_test_enc_1,
                  feature_names=feature_names,
                  plot_type="bar",
                  max_display=15)

### Save models

In [None]:
# Pickle the trained booster to '../models/main/xgboost_model_tune.pkl'
# (save_obj appends the '.pkl' suffix).
save_obj(model,'../models/main/xgboost_model_tune')