In [1]:
# importar librerias
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")

import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import joblib

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram, plot_evaluations

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, f1_score, accuracy_score, auc, balanced_accuracy_score, plot_confusion_matrix

import shap


def Interval(y_test, y_model, alpha = 5.0, n_rounds = 100, sample_size = 10000, return_pr = False):
    """Estimate resampling intervals for ROC AUC and PR AUC.

    Draws ``n_rounds`` random subsamples of ``sample_size`` rows from
    ``y_test`` (without replacement), scores the matching rows of ``y_model``
    on each subsample and reports the central ``100 - alpha`` percent range of
    the metric values obtained.

    Parameters
    ----------
    y_test : pandas.Series
        True binary labels.
    y_model : pandas.Series
        Model scores carrying the same index labels as ``y_test``.
    alpha : float, default 5.0
        Total tail mass, in percent, left outside the interval.
    n_rounds : int, default 100
        Number of subsamples drawn.
    sample_size : int, default 10000
        Rows per subsample.
    return_pr : bool, default False
        When True, also return the PR AUC interval.

    Returns
    -------
    str or tuple of str
        The ROC AUC interval formatted as ``'(low, high)'``; with
        ``return_pr=True`` a ``(roc_interval, pr_interval)`` pair.

    Raises
    ------
    ValueError
        Raised by ``Series.sample`` when ``sample_size`` exceeds ``len(y_test)``.
    """
    roc_scores = []
    pr_scores = []
    for _ in range(n_rounds):
        y_sample = y_test.sample(sample_size)
        # Pick the scores that belong to the sampled rows (matched by index label).
        scores = y_model.filter(items = y_sample.index, axis=0)
        roc_scores.append(roc_auc_score(y_sample, scores))
        precision, recall, _ = precision_recall_curve(y_sample, scores)
        pr_scores.append(auc(recall, precision))

    lower_p = alpha / 2.0
    upper_p = (100 - alpha) + (alpha / 2.0)
    lower_roc = max(0.0, np.percentile(roc_scores, lower_p))
    upper_roc = min(1.0, np.percentile(roc_scores, upper_p))
    t1 = '({0:.2f}, {1:.2f})'.format(lower_roc, upper_roc)
    # Percentiles over every resampled PR AUC; the previous version used only
    # the last scalar value, which always produced a zero-width interval.
    lower_pr = max(0.0, np.percentile(pr_scores, lower_p))
    upper_pr = min(1.0, np.percentile(pr_scores, upper_p))
    t2 = '({0:.2f}, {1:.2f})'.format(lower_pr, upper_pr)
    if return_pr:
        return t1, t2
    return t1

def plot_ROC_AUC(model, X_test, y_test):
    """Plot the ROC curve of a fitted binary classifier next to a no-skill baseline.

    Parameters
    ----------
    model : estimator exposing ``predict_proba``
        Fitted classifier; column 1 of its probabilities is used as the score.
    X_test : feature matrix accepted by ``model``.
    y_test : true binary labels for ``X_test``.

    Returns
    -------
    None
        The figure is created on the current matplotlib backend.
    """
    # Constant scores give the chance-level reference line.
    ns_probs = [0 for _ in range(len(y_test))]
    lr_probs = model.predict_proba(X_test)[:, 1]
    ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
    lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

    fig, ax = plt.subplots(figsize=(6, 6))
    # The dashed line is the baseline, not the model's ROC curve; label it the
    # same way plot_AUC_PR labels its baseline.
    ax.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    ax.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
    ax.set_xlabel('Tasa de falsos positivos')
    ax.set_ylabel('Tasa de verdaderos positivos')
    ax.set_title('Área bajo la curva ROC')
    ax.legend()
    
def plot_AUC_PR(model, X_test, y_test):
    """Plot the precision-recall curve of a fitted binary classifier and print F1 / PR AUC.

    Parameters
    ----------
    model : estimator exposing ``predict`` and ``predict_proba``
        Fitted classifier; column 1 of its probabilities is used as the score.
    X_test : feature matrix accepted by ``model``.
    y_test : true binary labels for ``X_test`` (supports ``y_test[y_test==1]``).

    Returns
    -------
    None
        Prints the F1 score and PR AUC and draws the figure.
    """
    lr_probs = model.predict_proba(X_test)[:, 1]
    # F1 needs hard class predictions; compute them here instead of relying on
    # a `y_model` variable that happens to exist in the notebook's global state.
    y_model = model.predict(X_test)
    lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
    lr_f1, lr_auc = f1_score(y_test, y_model), auc(lr_recall, lr_precision)

    print('Logistic: f1=%.3f auc=%.3f' % (lr_f1, lr_auc))

    # Share of positives: the precision of a classifier with no skill.
    no_skill = len(y_test[y_test==1]) / len(y_test)

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
    ax.plot(lr_recall, lr_precision, marker='.', label='Logistic')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Área bajo la curva Precision-Recall')
    ax.legend()
    
def plot_AUC_ROC_PR(name, model, X_test, y_test):
    """Draw ROC and precision-recall curves side by side, save the figure, return both areas.

    Parameters
    ----------
    name : str
        File stem; the figure is written to ``'plots/' + name + '.png'``.
    model : estimator exposing ``predict_proba``
        Fitted binary classifier; column 1 of its probabilities is the score.
    X_test : feature matrix accepted by ``model``.
    y_test : true binary labels for ``X_test`` (supports ``y_test[y_test==1]``).

    Returns
    -------
    tuple of float
        ``(roc_auc, pr_auc)`` computed from the predicted probabilities.
    """
    lr_probs = model.predict_proba(X_test)[:, 1]
    # Constant scores give the chance-level reference line.
    ns_probs = [0 for _ in range(len(y_test))]
    # Area under the curve that is actually plotted. The previous version scored
    # hard class predictions, which does not match the probability-based curve.
    roc_score = roc_auc_score(y_test, lr_probs)
    ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
    lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
    lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)
    pr_auc = auc(lr_recall, lr_precision)
    # Share of positives: the precision of a classifier with no skill.
    no_skill = len(y_test[y_test==1]) / len(y_test)

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    ax[0].plot(ns_fpr, ns_tpr, linestyle='--')
    ax[0].plot(lr_fpr, lr_tpr, marker='.', label="ROC curve (area = %0.2f)" % roc_score)
    ax[0].set_xlabel('Tasa de falsos positivos')
    ax[0].set_ylabel('Tasa de verdaderos positivos')
    ax[0].set_title('Área bajo la curva ROC')
    ax[0].legend()

    ax[1].plot([0, 1], [no_skill, no_skill], linestyle='--')
    ax[1].plot(lr_recall, lr_precision, marker='.', label="PR curve (area = %0.2f)" % pr_auc)
    ax[1].set_xlabel('Recall')
    ax[1].set_ylabel('Precision')
    ax[1].set_title('Área bajo la curva Precision-Recall')
    ax[1].legend()
    # The 'plots' directory must already exist.
    fig.savefig('plots/'+name+'.png')
    return roc_score, pr_auc

In [2]:
# Load the prepared dataset with explicit dtypes, derive the 'Destino' label and the
# vital-sign flag columns, min-max scale the numeric vitals, and assemble a first
# numeric feature matrix X. Later cells reassign X.
df = pd.read_csv('data/Datos_listos_text_transfer_label.csv',
                 dtype={'subject_id':object,
                        'hadm_id':object,
                        'stay_id':object,
                        'stay_m':'uint16',
                        'Admin_Hospi':'int8',
                        'temperature':'float32',
                        'heartrate':'float32',
                        'resprate':'uint8',
                        'o2sat':'float32',
                        'sbp':'uint8',
                        'dbp':'uint8',
                        'pain':'float32',
                        'gender':'int8',
                        'Age':'uint8',
                        'acuity':'uint8',
                        'temperature_cat':'category',
                        'heartrate_cat':'category', 
                        'resprate_cat':'category',
                        'o2sat_cat':'category',
                        'Presión arterial':'category',
                        'Reingreso_24':'int8',
                        'Reingreso_48':'int8',
                        'Reingreso_72':'int8',
                        'Reingresos_menores_72':'int8',
                        'Hospitalization':'uint8',
                        'ICU':'uint8',
                        'Surgery':'uint8'},
                                         #'acuity':'category'}#, 
                 parse_dates=['intime', 'outtime']
                )
# Index rows by stay_id; the index-based merges below align on it.
df.set_index('stay_id', inplace=True)
# Drop, in place, every row that has at least one missing value.
df.dropna(inplace=True) #1330
# Bare expression in the middle of a cell: evaluated but not displayed.
df.shape #386554
# Single destination label. Assignments run in order, so when several flags are 1
# for a row, Surgery overrides Hospitalization, which overrides ICU.
df['Destino'] = "Ambulatorio"
df.loc[df['ICU']==1 , 'Destino'] = "ICU"
df.loc[df['Hospitalization']==1 , 'Destino'] = "Hospitalization"
df.loc[df['Surgery']==1 , 'Destino'] = "Surgery"
# Replace the category labels by their integer codes.
df['Destino'] = df['Destino'].astype('category')
df['Destino'] = df['Destino'].cat.codes
# Also not displayed (not the last expression of the cell).
df['Destino'].value_counts()
# Binary flags: 1 when the vital sign lies outside the range coded below, else 0.
# NOTE(review): the 96.1 / 99.2 limits look like degrees Fahrenheit — confirm units.
df['temperature_dummy'] = 0
df.loc[(df['temperature']<96.1)|(df['temperature']>99.2), 'temperature_dummy'] = 1
df['heartrate_dummy'] = 0
df.loc[(df['heartrate']<60)|(df['heartrate']>104), 'heartrate_dummy'] = 1
df['resprate_dummy'] = 0
df.loc[(df['resprate']<15)|(df['resprate']>19), 'resprate_dummy'] = 1
# Oxygen saturation is flagged only when low.
df['o2sat_dummy'] = 0
df.loc[(df['o2sat']<95), 'o2sat_dummy'] = 1
# Hour of arrival; only referenced in commented-out feature lists below.
df['Hora_in'] = df['intime'].dt.hour

# Min-max scale the continuous variables into new *_MM columns.
# NOTE(review): the scaler is fitted on all rows, before any train/test split,
# so rows that later land in a test split influence the scaling.
MM = MinMaxScaler()
df[['temperature_MM', 'heartrate_MM',
    'resprate_MM', 'o2sat_MM',
    'sbp_MM', 'dbp_MM',
    'pain_MM', 'Age_MM']] = MM.fit_transform(df[['temperature', 'heartrate',
                                                 'resprate', 'o2sat',
                                                 'sbp', 'dbp',
                                                 'pain', 'Age']])

# Join the numeric features with one-hot blood-pressure categories on the index;
# the 'Normal' level is dropped from the dummies.
X = pd.merge(df[['temperature_MM', 'heartrate_MM',
                   'resprate_MM', 'o2sat_MM',
                   'sbp_MM', 'dbp_MM',
                   'pain_MM', 'Age_MM',
                   'gender', 'Reingresos_menores_72', #'Hora_in',
                   'temperature_dummy', 'heartrate_dummy', 'resprate_dummy', 'o2sat_dummy']],
               pd.get_dummies(df['Presión arterial']).drop(columns=['Normal']),
               left_index=True, right_index=True)
# 'anomalos': number of raised flags per row (vital-sign flags plus blood-pressure dummies).
X['anomalos'] = X[['temperature_dummy', 'heartrate_dummy',
                   'resprate_dummy', 'o2sat_dummy', 'Baja',
                   'Elevada', 'Estadio 1', 'Estadio 2']].sum(axis=1)
# 'presure_dummy': sum of the four non-'Normal' blood-pressure dummies; the
# individual dummy columns are then removed.
X['presure_dummy'] = X[['Baja','Elevada', 'Estadio 1', 'Estadio 2']].sum(axis=1)
X.drop(columns=['Baja','Elevada', 'Estadio 1', 'Estadio 2'], inplace=True)
X.shape    

(386554, 16)

In [5]:
%%time
# Vectorise the free-text 'chiefcomplaint' column with TF-IDF: unigrams only, case
# preserved, terms must appear in at least 10 documents, vocabulary capped at
# 200 terms, sublinear term frequency and L2-normalised rows.
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=(1,1),
                        stop_words=None,
                        lowercase=False,
                        max_df=1.,
                        min_df=10,
                        max_features=200,
                        norm='l2',
                        sublinear_tf=True)
# Apply the TF-IDF transform and convert the sparse result to a dense array.
# NOTE(review): the vocabulary is fitted on all rows, before any train/test split.
features = tfidf.fit_transform(df['chiefcomplaint']).toarray()
# X now holds only the text features; the X built in the previous cell is replaced.
X = pd.DataFrame(features, index= df.index, columns=tfidf.get_feature_names())
X.shape

CPU times: total: 2.58 s
Wall time: 2.63 s


(386554, 200)

In [6]:
# Rebuild the numeric feature block (same recipe as the first feature cell) as X_0
# and join it with the TF-IDF text features into X.
X_0 = pd.merge(df[['temperature_MM', 'heartrate_MM',
                   'resprate_MM', 'o2sat_MM',
                   'sbp_MM', 'dbp_MM',
                   'pain_MM', 'Age_MM',
                   'gender', 'Reingresos_menores_72', #'Hora_in',
                   'temperature_dummy', 'heartrate_dummy', 'resprate_dummy', 'o2sat_dummy']],
               pd.get_dummies(df['Presión arterial']).drop(columns=['Normal']),
               left_index=True, right_index=True)
# Count of raised flags per row (vital-sign flags plus blood-pressure dummies).
X_0['anomalos'] = X_0[['temperature_dummy', 'heartrate_dummy',
                   'resprate_dummy', 'o2sat_dummy', 'Baja',
                   'Elevada', 'Estadio 1', 'Estadio 2']].sum(axis=1)
# Collapse the blood-pressure dummies into one summed column, then drop them.
X_0['presure_dummy'] = X_0[['Baja','Elevada', 'Estadio 1', 'Estadio 2']].sum(axis=1)
X_0.drop(columns=['Baja','Elevada', 'Estadio 1', 'Estadio 2'], inplace=True)

# Index-aligned join of the numeric block with the dense TF-IDF matrix `features`
# (which must already exist from the TF-IDF cell).
X = pd.merge(X_0, pd.DataFrame(features, index= df.index, columns=tfidf.get_feature_names()),
             left_index=True, right_index=True)
X.shape

(386554, 216)

In [8]:
# Same feature assembly as the previous cell, with 'acuity' added as a feature.
# 'acuity' is converted to category dtype in df itself (in-place change that
# persists for later cells); it is not one-hot encoded here.
df['acuity'] = df['acuity'].astype('category')
X_0 = pd.merge(df[['acuity', 
                   'temperature_MM', 'heartrate_MM',
                   'resprate_MM', 'o2sat_MM',
                   'sbp_MM', 'dbp_MM',
                   'pain_MM', 'Age_MM',
                   'gender', 'Reingresos_menores_72', #'Hora_in',
                   'temperature_dummy', 'heartrate_dummy', 'resprate_dummy', 'o2sat_dummy']],
               pd.get_dummies(df['Presión arterial']).drop(columns=['Normal']),
               left_index=True, right_index=True)
# Count of raised flags per row (vital-sign flags plus blood-pressure dummies).
X_0['anomalos'] = X_0[['temperature_dummy', 'heartrate_dummy',
                   'resprate_dummy', 'o2sat_dummy', 'Baja',
                   'Elevada', 'Estadio 1', 'Estadio 2']].sum(axis=1)
# Collapse the blood-pressure dummies into one summed column, then drop them.
X_0['presure_dummy'] = X_0[['Baja','Elevada', 'Estadio 1', 'Estadio 2']].sum(axis=1)
X_0.drop(columns=['Baja','Elevada', 'Estadio 1', 'Estadio 2'], inplace=True)

# Index-aligned join with the dense TF-IDF matrix `features` from the TF-IDF cell.
X = pd.merge(X_0, pd.DataFrame(features, index= df.index, columns=tfidf.get_feature_names()),
             left_index=True, right_index=True)
X.shape

(386554, 217)

In [9]:
# Re-score previously saved models: for every target in `ys` and every suffix in
# `casos`, load the model from disk, evaluate it on the 30% hold-out split and
# append one row of metrics to the results workbook.
ys = ['ICU', 'Surgery', 'Hospitalization', 'Admin_Hospi']
casos = ['0', '2', '3']
for j in ys:
    print(j)
    y = df[j]
    # Fixed random_state, so the split is the same on every run for a given X and y.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    #if j == 'Admin_Hospi': j = 'Admin'
    for i in casos:
        # NOTE(review): despite the variable name, the 'LR_' file prefix and the
        # 'LogisticRegression_' label below suggest these are logistic regressions — confirm.
        # joblib.load unpickles the file; only load model files you trust.
        LGBM = joblib.load('models/LR_Numericas_text_numbers_acuity_'+j+i+'.joblib')
        # Positive-class probabilities, indexed like X_test so Interval can match rows.
        y_pred = pd.Series(LGBM.predict_proba(X_test)[:, 1], index=X_test.index)
        # Hard class predictions for F1 / accuracy metrics.
        y_model = LGBM.predict(X_test)
        roc = roc_auc_score(y_test, y_pred)
        t1 = Interval(y_test, y_pred, alpha = 5.0)
        # The workbook is re-read and rewritten in full on every iteration.
        record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
        run = {'Fecha_hora':pd.to_datetime('today'),
               'Tamaño_test': X_test.shape, #i,
               'Dependiente':j,
               'Modelo': 'LogisticRegression_text_numbers_acuity_'+j+i,#u,
               'Parametros': 'NA',#LGBM.params_,
               'ROC_AUC_train': 'NA',#LGBM.score_,
               'ROC_AUC_Score': roc,
               'Intervalo_ROC_AUC':t1, 
               # PR metrics are not computed in this cell; 0 is written as a placeholder.
               'PR_AUC_Score': 0,
               'Intervalo_PR_AUC': 0,
               'f1_score': f1_score(y_test, y_model), 
               'Accuracy': accuracy_score(y_test, y_model),
               'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
               'vars':', '.join(X_train.columns)
              }
        # NOTE(review): DataFrame.append was removed in pandas 2.0; this requires an older pandas.
        record = record.append(run, ignore_index=True)
        record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)

ICU
Surgery
Hospitalization
Admin_Hospi


In [None]:
%%time
# For each target, restricted to rows with Admin_Hospi == 1, tune a logistic
# regression with Bayesian search over four search spaces, then evaluate, plot,
# log the metrics to the results workbook and save every fitted model.
# NOTE(review): `espacio[i]` is indexed with 0..3, so `espacio` is presumably a
# sequence of four search spaces here; no such sequence is defined in the visible
# cells — confirm.
# NOTE(review): y only covers rows with Admin_Hospi == 1, so X must contain exactly
# the same rows for train_test_split to accept the pair.
# NOTE(review): Interval, as defined at the top of this notebook, returns a single
# string, so the `t1, t2 = ...` unpacking fails with it; it is also fed hard class
# predictions rather than probabilities.
ys = [ 'Surgery', 'ICU']#ICU
for j in ys:
    y = df[df['Admin_Hospi']==1][j]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    print(X_train.shape)
    print(X_test.shape)
    for i in range(4):
        try:
            model = BayesSearchCV(LogisticRegression(),
                                  espacio[i],
                                  scoring='roc_auc',
                                  n_points= 5,
                                  n_iter= 50,
                                  cv= 5,
                                  verbose = 0,
                                  n_jobs=6,
                                  random_state= 88)
            model.fit(X_train, y_train)
            clf = model.best_estimator_
            y_model = pd.Series(clf.predict(X_test), index=X_test.index)
            roc, pr = plot_AUC_ROC_PR('LogisticRegression_text_numbers_acuity_ofAdmin_'+j+str(i), clf, X_test, y_test)
            t1, t2 = Interval(y_test, y_model, alpha = 5.0)
            print(t1, t2)
            # The workbook is re-read and rewritten in full on every iteration.
            record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
            run = {'Fecha_hora':pd.to_datetime('today'),
                   'Tamaño_test': X_test.shape, #i,
                   'Dependiente':j,
                   'Modelo': 'LogisticRegression_text_numbers_acuity_ofAdmin'+str(i),#u,
                   'Parametros': model.best_params_,
                   'ROC_AUC_train': model.best_score_,
                   'ROC_AUC_Score': roc,
                   'Intervalo_ROC_AUC':t1, 
                   'PR_AUC_Score': pr,
                   'Intervalo_PR_AUC':t2,
                   'f1_score': f1_score(y_test, y_model), 
                   'Accuracy': accuracy_score(y_test, y_model),
                   'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
                   'vars':', '.join(X_train.columns)
                  }
            joblib.dump(clf, 'models/LR_Numericas_text_numbers_acuity_ofAdmin_'+j+str(i)+'.joblib')
            record = record.append(run, ignore_index=True)
            record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
            plot_confusion_matrix(clf, X_test, y_test, normalize='true')
        except Exception as err:
            # Catch Exception instead of using a bare except so KeyboardInterrupt and
            # SystemExit still stop the run, and report which target / search space
            # failed and why instead of hiding the cause.
            print('se presentó un error', j, i, repr(err))

(121202, 217)
(51945, 217)
(0.50, 0.50) (0.54, 0.54)


In [None]:
%%time
# Tune an LGBMClassifier on the 'Destino' label with Bayesian search (accuracy
# scoring, 5-fold CV, 50 iterations), evaluate on a 30% hold-out, append the
# metrics to the results workbook and save the best estimator.
# NOTE(review): every assignment to `espacio` sits in a cell further down the
# notebook; on a fresh kernel this cell raises NameError unless one of them runs first.
# NOTE(review): 'Destino' can take more than two values, while plot_AUC_ROC_PR
# scores predict_proba(...)[:, 1] and f1_score is called with its default
# averaging — both look written for binary targets; confirm this cell runs as intended.
# NOTE(review): Interval, as defined at the top of this notebook, returns a single
# string, so the `t1, t2 = ...` unpacking fails with it; it is also fed hard class
# predictions rather than scores.
y = df['Destino']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
model = BayesSearchCV(LGBMClassifier(),
                      espacio,
                      scoring='accuracy',
                      n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 0,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_train, y_train)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_Destino_text_numbers_acuity', clf, X_test, y_test)
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape, #i,
       'Dependiente':'Destino',
       'Modelo': 'LGBMClassifier_text_numbers_acuity',#u,
       'Parametros': model.best_params_,
       # Best cross-validated score of the search; the metric is `scoring` above
       # (accuracy here), despite the 'ROC_AUC_train' key.
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
       'vars':', '.join(X_train.columns)
      }
joblib.dump(clf, 'models/LGBM_text_numbers_acuity_Destino.joblib')
# NOTE(review): DataFrame.append was removed in pandas 2.0; this requires an older pandas.
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
# Stand-alone assignment; no other visible cell reads `multi_class`.
multi_class = 'ovr'

In [None]:
%%time
# Same experiment as the previous 'Destino' cell, but the Bayesian search optimises
# balanced accuracy. The plot name, model label and model file path are identical
# to that cell, so running both overwrites the earlier figure and saved model.
# NOTE(review): every assignment to `espacio` sits in a cell further down the
# notebook; on a fresh kernel this cell raises NameError unless one of them runs first.
# NOTE(review): 'Destino' can take more than two values, while plot_AUC_ROC_PR
# scores predict_proba(...)[:, 1] and f1_score is called with its default
# averaging — both look written for binary targets; confirm this cell runs as intended.
# NOTE(review): Interval, as defined at the top of this notebook, returns a single
# string, so the `t1, t2 = ...` unpacking fails with it; it is also fed hard class
# predictions rather than scores.
y = df['Destino']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
model = BayesSearchCV(LGBMClassifier(),
                      espacio,
                      scoring='balanced_accuracy',
                      n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 0,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_train, y_train)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_Destino_text_numbers_acuity', clf, X_test, y_test)
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape, #i,
       'Dependiente':'Destino',
       'Modelo': 'LGBMClassifier_text_numbers_acuity',#u,
       'Parametros': model.best_params_,
       # Best cross-validated score of the search; the metric is `scoring` above
       # (balanced accuracy here), despite the 'ROC_AUC_train' key.
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
       'vars':', '.join(X_train.columns)
      }
joblib.dump(clf, 'models/LGBM_text_numbers_acuity_Destino.joblib')
# NOTE(review): DataFrame.append was removed in pandas 2.0; this requires an older pandas.
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
# Bayesian search space for LGBMClassifier (skopt dimensions).
# 'num_leaves' used to appear twice in this literal; Python keeps only the last
# value for a repeated key, so the effective range was already Integer(2, 256).
# The discarded Integer(2, 500) entry is removed to make that explicit.
# NOTE(review): LightGBM documents 'subsample' as an alias of 'bagging_fraction';
# tuning both at once is probably redundant — confirm which one should be kept.
espacio = {"n_estimators": Integer(10, 3000),
           'max_depth':Integer(1, 40),
           'num_leaves': Integer(2, 256),
           'learning_rate': Real(0.0001, 0.3, prior='uniform'),
           'lambda_l1': Real(1e-8, 10.0, prior='log-uniform'),
           'lambda_l2': Real(1e-8, 10.0, prior='log-uniform'),
           'feature_fraction': Real(0.4, 1.0, prior='uniform'),
           'bagging_fraction': Real(0.4, 1.0, prior='uniform'),
           'bagging_freq': Integer(1, 7),
           'min_child_samples': Integer(5, 100),
           'subsample':Real(0.2, 1, prior='uniform')}

In [None]:
import optuna
import lightgbm as lgb
y = df['Admin_Hospi']
def objective(trial):
    """Optuna objective: train LightGBM with sampled hyper-parameters and score it.

    Reads the notebook-level ``X`` and ``y``, holds out 25% of the rows, trains
    a binary LightGBM booster with parameters suggested by ``trial`` and returns
    the hold-out ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the predicted probabilities on the hold-out rows.
    """
    # Fixed seed: every trial is scored on the same hold-out rows, so differences
    # between trials come from the hyper-parameters and not from the split.
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)
    dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        "n_estimators": trial.suggest_int('n_estimators', 10, 3000),
        'max_depth':trial.suggest_int('max_depth', 1, 40),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-10, 1),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    gbm = lgb.train(param, dtrain)
    # With the binary objective, predict() returns probabilities. Score them
    # directly: rounding to 0/1 first throws away the ranking ROC AUC measures.
    preds = gbm.predict(test_x)
    roc_auc = roc_auc_score(test_y, preds)
    return roc_auc
 
# Run 100 trials of `objective`, keeping the parameters that maximise its return value.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

In [None]:
# Summarise the Optuna study created in the previous cell.
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best score:', study.best_trial.values)

In [None]:
# Hyper-parameter tuning through Optuna's LightGBM integration.
# NOTE: this import rebinds the name `lgb` (bound to plain `lightgbm` in an
# earlier cell) to the Optuna wrapper for every cell executed afterwards.
import optuna.integration.lightgbm as lgb
# Unseeded split: the hold-out rows differ from run to run.
# `y` is whatever an earlier cell last assigned.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25)
dtrain = lgb.Dataset(train_x, label=train_y)
dval = lgb.Dataset(test_x, label=test_y)

params = {
        "objective": "binary",
        "metric": "binary_logloss",
    } 

# Containers handed to lgb.train and printed afterwards.
# NOTE(review): `best_params=` / `tuning_history=` are presumably filled in by the
# Optuna wrapper — confirm that the installed Optuna version still accepts them.
best_params, tuning_history = dict(), list()
booster = lgb.train(params, dtrain, valid_sets=dval,
                    verbose_eval=0,
                    best_params=best_params,
                    tuning_history=tuning_history)
 
print('Best Params:', best_params)
print('Tuning history:', tuning_history)

In [None]:
def test_run() -> None:
    """Smoke-test LightGBMTuner on the notebook-level train/validation split.

    Builds LightGBM datasets from the global ``train_x``/``train_y`` and
    ``test_x``/``test_y``, runs the tuner with a tiny budget (2 boosting rounds,
    early stopping after 1 round) and asserts that the study holds exactly 68
    trials, all in state ``COMPLETE``.
    """
    expected_trials = 68
    base_params = dict(objective="binary", metric="binary_logloss")

    train_set = lgb.Dataset(train_x, label=train_y)
    valid_set = lgb.Dataset(test_x, label=test_y)

    tuning_study = optuna.create_study()
    lgb.LightGBMTuner(
        base_params,
        train_set,
        study=tuning_study,
        valid_sets=valid_set,
        verbose_eval=1,
        early_stopping_rounds=1,
        num_boost_round=2,
    ).run()

    trials = tuning_study.trials_dataframe()
    completed = trials[trials["state"] == "COMPLETE"]
    assert len(trials) == expected_trials
    assert len(completed) == expected_trials

In [None]:
# Fixed LightGBM parameter set for a multiclass model, consumed by the lgb.cv cell below.
# NOTE(review): 'num_class' is 7, whereas the 'Destino' label built in this notebook
# has four possible values — confirm which target this dictionary is meant for.
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 7,
    # The lgb.cv cell reads the first entry of this list to locate the CV result columns.
    'metric': ['multi_error'],
    "learning_rate": 0.05,
     "num_leaves": 60,
     "max_depth": 9,
     "feature_fraction": 0.45,
     "bagging_fraction": 0.3,
     "reg_alpha": 0.15,
     "reg_lambda": 0.15,
#      "min_split_gain": 0,
      "min_child_weight": 0
                }

In [None]:
# Cross-validate `lgbm_params` to pick the number of boosting rounds, then log the result.
# NOTE(review): `time`, `lgtrain`, `results` and `Home` are not defined or imported in
# any visible cell, so this cell cannot run on a fresh kernel as shown. `lgb` is
# whichever module was bound last (plain lightgbm, or the Optuna wrapper) — confirm.
modelstart= time.time()
# Find Optimal Parameters / Boosting Rounds
lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgtrain,
    num_boost_round=2000,
    stratified=True,
    nfold = 5,
    verbose_eval=50,
    seed = 23,
    early_stopping_rounds=75)

# Name of the first metric; the CV results are read from '<metric>-mean' / '<metric>-stdv'.
loss = lgbm_params["metric"][0]
# np.argmin returns the 0-based position of the lowest mean error.
optimal_rounds = np.argmin(lgb_cv[str(loss) + '-mean'])
best_cv_score = min(lgb_cv[str(loss) + '-mean'])

print("\nOptimal Round: {}\nOptimal Score: {} + {}".format(
    optimal_rounds,best_cv_score,lgb_cv[str(loss) + '-stdv'][optimal_rounds]))

results = results.append({"Rounds": optimal_rounds,
                          "Score": best_cv_score,
                          "STDV": lgb_cv[str(loss) + '-stdv'][optimal_rounds],
                          "LB": None,
                          "Parameters": lgbm_params}, ignore_index=True)
# Append the whole `results` frame to results.csv (no header) when `Home` is True.
if Home is True:
    with open('results.csv', 'a') as f:
        results.to_csv(f, header=False)

In [None]:
# Search space for the RandomForestClassifier search in the next cell: tuples are
# (low, high) ranges, lists are categorical choices. Rebinds the shared `espacio` name.
espacio = {"n_estimators": (1, 3000),
           'min_samples_split': (2,50),
           'min_samples_leaf': (1,50),
           'max_depth':(1, 20),
           'class_weight': ["balanced", "balanced_subsample"],
           "bootstrap": [True, False],
           "criterion": ["gini", "entropy"],}

In [None]:
# Tune a RandomForestClassifier on 'Admin_Hospi' with Bayesian search (ROC AUC,
# 5-fold CV, 50 iterations), evaluate on a 30% hold-out, log the metrics and save the model.
# NOTE(review): the figure is saved as 'LGBMClassifier_V5_Admin_Hospi' although the
# model is a random forest; other cells use the same name, so the images overwrite
# each other.
# NOTE(review): Interval, as defined at the top of this notebook, returns a single
# string, so the `t1, t2 = ...` unpacking fails with it; it is also fed hard class
# predictions rather than scores.
y = df['Admin_Hospi']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = BayesSearchCV(RandomForestClassifier(),
                      espacio,
                      scoring='roc_auc',
                      n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 3,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_train, y_train)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_V5_'+'Admin_Hospi', clf, X_test, y_test)
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape, #i,
       'Dependiente':'Admin_Hospi',
       'Modelo': 'RandomForestClassifier_MM',#u,
       'Parametros': model.best_params_,
       # Best cross-validated ROC AUC found by the search.
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
       'vars':', '.join(X_train.columns)
      }
# NOTE(review): DataFrame.append was removed in pandas 2.0; this requires an older pandas.
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
joblib.dump(clf, 'models/RF_Numericas_dummy.joblib')
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
# SHAP feature attributions for whatever tree model `clf` currently holds,
# computed on the full training matrix.
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_train)
# NOTE(review): indexing [1] assumes shap_values is a per-class list and selects
# the second class — confirm for the installed shap version and model type.
shap.summary_plot(shap_values[1], X_train)

In [None]:
# Search space for the LogisticRegression search in the next cell (rebinds the shared
# `espacio` name), plus the 30% hold-out split on 'Admin_Hospi' used by later cells.
espacio = {'solver': ['liblinear', 'saga'],  
           'penalty': ['l1','l2'],
           # 3-tuples are (low, high, prior).
           'tol': (1e-5, 1e-3, 'log-uniform'),
           'C': (1e-5, 100, 'log-uniform'),
           'fit_intercept': [True, False]}
y = df['Admin_Hospi']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)

In [None]:
# Tune a LogisticRegression on the split prepared in the previous cell (ROC AUC,
# 5-fold CV, 50 iterations), evaluate, log the metrics and save the model.
# NOTE(review): the figure is saved as 'LGBMClassifier_V5_Admin_Hospi' although the
# model is a logistic regression; other cells use the same name, so the images
# overwrite each other.
# NOTE(review): Interval, as defined at the top of this notebook, returns a single
# string, so the `t1, t2 = ...` unpacking fails with it; it is also fed hard class
# predictions rather than scores.
model = BayesSearchCV(LogisticRegression(),
                      espacio,
                      scoring='roc_auc',
                      n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 0,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_train, y_train)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_V5_'+'Admin_Hospi', clf, X_test, y_test)
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape, #i,
       'Dependiente':'Admin_Hospi',
       'Modelo': 'LogisticRegression_MM',#u,
       'Parametros': model.best_params_,
       # Best cross-validated ROC AUC found by the search.
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
       'vars':', '.join(X_train.columns)
      }
# NOTE(review): DataFrame.append was removed in pandas 2.0; this requires an older pandas.
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
joblib.dump(clf, 'models/LR_Numericas_dummy.joblib')
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
# Inspect the fitted linear model held in `clf`: intercept, class labels and one
# coefficient per feature column of X.
print('intercept ', clf.intercept_[0])
print('classes', clf.classes_)
pd.DataFrame({'coeff': clf.coef_[0]}, 
             index=X.columns)

In [None]:
# Tune GaussianNB's var_smoothing with Bayesian search (ROC AUC, 5-fold CV,
# 50 iterations) on the X_train / y_train left by an earlier cell, then evaluate,
# log the metrics and save the model. Rebinds the shared `espacio` name.
# NOTE(review): Interval, as defined at the top of this notebook, returns a single
# string, so the `t1, t2 = ...` unpacking fails with it; it is also fed hard class
# predictions rather than scores.
from sklearn.naive_bayes import GaussianNB
espacio = {'var_smoothing': Real(1e-9, 1, prior='log-uniform')}

model = BayesSearchCV(GaussianNB(),
                      espacio,
                      scoring='roc_auc',
                      #n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 0,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_train, y_train)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
roc, pr = plot_AUC_ROC_PR('GaussianNB_V5_'+'Admin_Hospi', clf, X_test, y_test)
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape, #i,
       'Dependiente':'Admin_Hospi',
       'Modelo': 'GaussianNB_MM',#u,
       'Parametros': model.best_params_,
       # Best cross-validated ROC AUC found by the search.
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
       'vars':', '.join(X_train.columns)
      }
# NOTE(review): DataFrame.append was removed in pandas 2.0; this requires an older pandas.
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
joblib.dump(clf, 'models/NB_Numericas_dummy.joblib')
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
# Display the feature names of the current training matrix.
X_train.columns

In [None]:
# Recompute the vital-sign flags and scaled columns (a repeat of the first
# feature-engineering cell) and rebuild X from the numeric block plus TF-IDF features.
# Unlike the earlier feature cells, the blood-pressure dummy columns are kept in X
# and no 'presure_dummy' column is created.
# Binary flags: 1 when the vital sign lies outside the range coded below, else 0.
df['temperature_dummy'] = 0
df.loc[(df['temperature']<96.1)|(df['temperature']>99.2), 'temperature_dummy'] = 1
df['heartrate_dummy'] = 0
df.loc[(df['heartrate']<60)|(df['heartrate']>104), 'heartrate_dummy'] = 1
df['resprate_dummy'] = 0
df.loc[(df['resprate']<15)|(df['resprate']>19), 'resprate_dummy'] = 1
df['o2sat_dummy'] = 0
df.loc[(df['o2sat']<95), 'o2sat_dummy'] = 1
df['Hora_in'] = df['intime'].dt.hour

# NOTE(review): the scaler is fitted on all rows, before any train/test split.
MM = MinMaxScaler()
df[['temperature_MM', 'heartrate_MM',
    'resprate_MM', 'o2sat_MM',
    'sbp_MM', 'dbp_MM',
    'pain_MM', 'Age_MM']] = MM.fit_transform(df[['temperature', 'heartrate',
                                                 'resprate', 'o2sat',
                                                 'sbp', 'dbp',
                                                 'pain', 'Age']])

# Numeric features joined with one-hot blood-pressure categories ('Normal' dropped).
X_0 = pd.merge(df[['temperature_MM', 'heartrate_MM',
                   'resprate_MM', 'o2sat_MM',
                   'sbp_MM', 'dbp_MM',
                   'pain_MM', 'Age_MM',
                   'gender', 'Reingresos_menores_72', #'Hora_in',
                   'temperature_dummy', 'heartrate_dummy', 'resprate_dummy', 'o2sat_dummy']],
               pd.get_dummies(df['Presión arterial']).drop(columns=['Normal']),
               left_index=True, right_index=True)
# Reuses `features` and `tfidf` from the most recently executed TF-IDF cell.
X = pd.merge(X_0, pd.DataFrame(features, index= df.index, columns=tfidf.get_feature_names()),
             left_index=True, right_index=True)
# Count of raised flags per row (vital-sign flags plus blood-pressure dummies).
X['anomalos'] = X[['temperature_dummy', 'heartrate_dummy',
                   'resprate_dummy', 'o2sat_dummy', 'Baja',
                   'Elevada', 'Estadio 1', 'Estadio 2']].sum(axis=1)
X.shape  

In [None]:
# Combined label: sum of the ICU and Surgery flags.
# NOTE(review): a row with both flags set would get the value 2, not 1 — check
# the value_counts output below to confirm the label is binary.
df['ICU_QX'] = df['ICU'] + df['Surgery']
df['ICU_QX'].value_counts(normalize=True)

In [None]:
%%time
# Tune an LGBMClassifier on the combined 'ICU_QX' label with Bayesian search
# (ROC AUC, 5-fold CV, 50 iterations), evaluate on a 30% hold-out and append the
# metrics to the results workbook.
# NOTE(review): `espacio` is whichever search space was assigned last; make sure the
# LightGBM space is the one in memory before running this cell.
# NOTE(review): the figure is saved under the shared name
# 'LGBMClassifier_V5_Admin_Hospi', so it overwrites the plot of other cells.
# NOTE(review): Interval, as defined at the top of this notebook, returns a single
# string, so the `t1, t2 = ...` unpacking fails with it; it is also fed hard class
# predictions rather than scores.
y = df['ICU_QX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
model = BayesSearchCV(LGBMClassifier(),
                      espacio,
                      scoring='roc_auc',
                      n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 0,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_train, y_train)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_V5_'+'Admin_Hospi', clf, X_test, y_test)
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape, #i,
       # The target of this run is ICU_QX; the row used to be logged as 'Admin_Hospi'.
       'Dependiente':'ICU_QX',
       'Modelo': 'LGBMClassifier_V6_MM',#u,
       'Parametros': model.best_params_,
       # Best cross-validated ROC AUC found by the search.
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
       # Record the feature list like the other result rows do.
       'vars':', '.join(X_train.columns)
      }
# NOTE(review): DataFrame.append was removed in pandas 2.0; this requires an older pandas.
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
%%time
# Same 'ICU_QX' experiment as the previous cell, but the training split (60% of the
# rows) is oversampled with ADASYN before the Bayesian search; evaluation uses the
# untouched 40% hold-out.
# NOTE(review): ADASYN() is created without a random_state, so the synthetic rows
# differ between runs.
# NOTE(review): oversampling happens before cross-validation, so synthetic rows can
# appear in both the training and validation folds; the CV score logged as
# 'ROC_AUC_train' is probably optimistic.
# NOTE(review): `espacio` is whichever search space was assigned last; make sure the
# LightGBM space is the one in memory before running this cell.
# NOTE(review): Interval, as defined at the top of this notebook, returns a single
# string, so the `t1, t2 = ...` unpacking fails with it; it is also fed hard class
# predictions rather than scores.
y = df['ICU_QX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
print(X_train.shape, X_test.shape)
print(X_train.shape, X_resampled.shape)

model = BayesSearchCV(LGBMClassifier(),
                      espacio,
                      scoring='roc_auc',
                      n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 0,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_resampled, y_resampled)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_V5_'+'Admin_Hospi', clf, X_test, y_test)
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape, #i,
       # The target of this run is ICU_QX; the row used to be logged as 'Admin_Hospi'.
       'Dependiente':'ICU_QX',
       'Modelo': 'LGBMClassifier_V6_MM_ADASYN',#u,
       'Parametros': model.best_params_,
       # Best cross-validated ROC AUC found by the search (on the resampled data).
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
       # Record the feature list like the other result rows do.
       'vars':', '.join(X_train.columns)
      }
# NOTE(review): DataFrame.append was removed in pandas 2.0; this requires an older pandas.
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
# Keep only the rows where Admin_Hospi equals 1; later cells model this subset.
df_2 = df[df['Admin_Hospi']==1]
df_2.shape

In [None]:
# Class proportions of the ICU_QX label within the subset.
df_2['ICU_QX'].value_counts(normalize=True)

In [None]:
# Rebuild the feature matrix X for the df_2 subset: selected numeric/flag columns,
# blood-pressure category dummies and TF-IDF weights of the chief complaint text.
# TfidfVectorizer is already imported in the first cell of the notebook.
tfidf = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=(1, 1),
    stop_words=None,
    lowercase=False,
    max_df=1.,
    min_df=10,
    max_features=200,
    norm='l2',
    sublinear_tf=True,
)
# Fit the vocabulary on df_2 only and densify the result so it can be wrapped in a DataFrame.
features = tfidf.fit_transform(df_2['chiefcomplaint']).toarray()

base_columns = [
    'temperature_MM', 'heartrate_MM',
    'resprate_MM', 'o2sat_MM',
    'sbp_MM', 'dbp_MM',
    'pain_MM', 'Age_MM',
    'gender', 'Reingresos_menores_72',
    'temperature_dummy', 'heartrate_dummy', 'resprate_dummy', 'o2sat_dummy',
]
# Dummies are built from the full df; the index-on-index merge below keeps only df_2 rows.
pressure_dummies = pd.get_dummies(df['Presión arterial']).drop(columns=['Normal'])
X_0 = pd.merge(df_2[base_columns], pressure_dummies,
               left_index=True, right_index=True)

tfidf_frame = pd.DataFrame(features, index=df_2.index, columns=tfidf.get_feature_names())
X = pd.merge(X_0, tfidf_frame, left_index=True, right_index=True)

# Row-wise count of the abnormal-vital flags plus the non-'Normal' blood-pressure dummies.
abnormal_columns = ['temperature_dummy', 'heartrate_dummy', 'resprate_dummy',
                    'o2sat_dummy', 'Baja', 'Elevada', 'Estadio 1', 'Estadio 2']
X['anomalos'] = X[abnormal_columns].sum(axis=1)
X.shape

In [None]:
%%time
# Tune an LGBM classifier for ICU_QX on the df_2 subset (no resampling), evaluate it on
# the hold-out split and append the run to outputs/Tabla_Resultados.xlsx.
# `X`, `espacio`, `plot_AUC_ROC_PR` and `Interval` come from other cells of the notebook.
y = df_2['ICU_QX']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print(X_train.shape, X_test.shape)

# Bayesian hyper-parameter search settings shared by the LGBM experiments.
search_settings = {
    'scoring': 'roc_auc',
    'n_points': 5,
    'n_iter': 50,
    'cv': 5,
    'verbose': 0,
    'n_jobs': 6,
    'random_state': 88,
}
model = BayesSearchCV(LGBMClassifier(), espacio, **search_settings)
model.fit(X_train, y_train)
clf = model.best_estimator_

# Hard class predictions, indexed like X_test so Interval can filter them by index.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
# NOTE(review): the label passed here says 'Admin_Hospi' although the target is ICU_QX.
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_V5_' + 'Admin_Hospi', clf, X_test, y_test)
# NOTE(review): the Interval definition in the first cell returns a single string;
# confirm a two-value version is the one in scope before relying on this unpacking.
t1, t2 = Interval(y_test, y_model, alpha=5.0)
print(t1, t2)

record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {
    'Fecha_hora': pd.to_datetime('today'),
    'Tamaño_test': X_test.shape,
    'Dependiente': 'ICU_QX',
    'Modelo': 'LGBMClassifier_V6_BA',
    'Parametros': model.best_params_,
    'ROC_AUC_train': model.best_score_,
    'ROC_AUC_Score': roc,
    'Intervalo_ROC_AUC': t1,
    'PR_AUC_Score': pr,
    'Intervalo_PR_AUC': t2,
    'f1_score': f1_score(y_test, y_model),
    'Accuracy': accuracy_score(y_test, y_model),
    'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
}
# Append this run as a new row and write the whole table back.
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
%%time
# Tune an LGBM classifier for ICU_QX on a randomly oversampled training split of the df_2
# subset, evaluate it on the untouched hold-out split and log the run to the results table.
# `X`, `espacio`, `plot_AUC_ROC_PR` and `Interval` come from other cells of the notebook.
y = df_2['ICU_QX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Only the training split is oversampled. Seeding the sampler makes the duplicated rows,
# and therefore the logged metrics, repeatable across kernel restarts.
X_resampled, y_resampled = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
print(X_train.shape, X_test.shape)
print(X_train.shape, X_resampled.shape)

# NOTE(review): CV folds are drawn after oversampling, so copies of the same row can sit
# in both the training and validation folds; best_score_ is likely optimistic.
model = BayesSearchCV(LGBMClassifier(),
                      espacio,
                      scoring='roc_auc',
                      n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 0,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_resampled, y_resampled)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test so Interval can filter them by index.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
# NOTE(review): the label passed here says 'Admin_Hospi' although the target is ICU_QX.
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_V5_'+'Admin_Hospi', clf, X_test, y_test)
# NOTE(review): the Interval definition in the first cell returns a single string;
# confirm a two-value version is the one in scope before relying on this unpacking.
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape,
       'Dependiente':'ICU_QX',
       'Modelo': 'LGBMClassifier_V6_ROS',
       'Parametros': model.best_params_,
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
      }
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
%%time
# Tune an LGBM classifier for ICU_QX on a SMOTE-oversampled training split of the df_2
# subset, evaluate it on the untouched hold-out split and log the run to the results table.
# `X`, `espacio`, `plot_AUC_ROC_PR` and `Interval` come from other cells of the notebook.
y = df_2['ICU_QX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Only the training split is oversampled. Seeding SMOTE makes the synthetic samples,
# and therefore the logged metrics, repeatable across kernel restarts.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)
print(X_train.shape, X_test.shape)
print(X_train.shape, X_resampled.shape)

# NOTE(review): CV folds are drawn after oversampling, so synthetic neighbours of a sample
# can land in the validation fold; best_score_ is likely optimistic.
model = BayesSearchCV(LGBMClassifier(),
                      espacio,
                      scoring='roc_auc',
                      n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 0,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_resampled, y_resampled)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test so Interval can filter them by index.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
# NOTE(review): the label passed here says 'Admin_Hospi' although the target is ICU_QX.
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_V5_'+'Admin_Hospi', clf, X_test, y_test)
# NOTE(review): the Interval definition in the first cell returns a single string;
# confirm a two-value version is the one in scope before relying on this unpacking.
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape,
       'Dependiente':'ICU_QX',
       'Modelo': 'LGBMClassifier_V6_SMOTE',
       'Parametros': model.best_params_,
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
      }
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
# Tune an LGBM classifier for ICU_QX on an ADASYN-oversampled training split of the df_2
# subset, evaluate it on the untouched hold-out split and log the run to the results table.
# `X`, `espacio`, `plot_AUC_ROC_PR` and `Interval` come from other cells of the notebook.
y = df_2['ICU_QX']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Only the training split is oversampled. Seeding ADASYN makes the synthetic samples,
# and therefore the logged metrics, repeatable across kernel restarts.
X_resampled, y_resampled = ADASYN(random_state=42).fit_resample(X_train, y_train)
print(X_train.shape, X_test.shape)
print(X_train.shape, X_resampled.shape)

# NOTE(review): CV folds are drawn after oversampling, so synthetic neighbours of a sample
# can land in the validation fold; best_score_ is likely optimistic.
model = BayesSearchCV(LGBMClassifier(),
                      espacio,
                      scoring='roc_auc',
                      n_points= 5,
                      n_iter= 50,
                      cv= 5,
                      verbose = 0,
                      n_jobs=6,
                      random_state= 88)
model.fit(X_resampled, y_resampled)
clf = model.best_estimator_
# Hard class predictions, indexed like X_test so Interval can filter them by index.
y_model = pd.Series(clf.predict(X_test), index=X_test.index)
# NOTE(review): the label passed here says 'Admin_Hospi' although the target is ICU_QX.
roc, pr = plot_AUC_ROC_PR('LGBMClassifier_V5_'+'Admin_Hospi', clf, X_test, y_test)
# NOTE(review): the Interval definition in the first cell returns a single string;
# confirm a two-value version is the one in scope before relying on this unpacking.
t1, t2 = Interval(y_test, y_model, alpha = 5.0)
print(t1, t2)
record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
run = {'Fecha_hora':pd.to_datetime('today'),
       'Tamaño_test': X_test.shape,
       # The target modelled in this cell is ICU_QX, so the run is logged under that name,
       # matching the other runs trained on df_2.
       'Dependiente':'ICU_QX',
       'Modelo': 'LGBMClassifier_V6_ADASYN',
       'Parametros': model.best_params_,
       'ROC_AUC_train': model.best_score_,
       'ROC_AUC_Score': roc,
       'Intervalo_ROC_AUC':t1, 
       'PR_AUC_Score': pr,
       'Intervalo_PR_AUC':t2,
       'f1_score': f1_score(y_test, y_model), 
       'Accuracy': accuracy_score(y_test, y_model),
       'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
      }
record = record.append(run, ignore_index=True)
record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)
plot_confusion_matrix(clf, X_test, y_test, normalize='true')

In [None]:
# For every value of df['Destino'], rank the TF-IDF tokens of the chief complaint by their
# chi-squared statistic against "row has this value" and print the 100 highest-ranked ones.
from sklearn.feature_selection import chi2

labels = df['Destino']
tfidf = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=(1, 1),
    stop_words=None,
    lowercase=False,
    max_df=1.,
    min_df=10,
    max_features=1000,
    norm='l2',
    sublinear_tf=True,
)
# Fit TF-IDF on the full df and densify the matrix.
features = tfidf.fit_transform(df['chiefcomplaint']).toarray()

# The vocabulary does not change between categories, so it is built once.
vocabulary = np.array(tfidf.get_feature_names())
for category_id in sorted(labels.unique()):
    # chi2 returns (statistics, p-values); only the statistics are used for ranking.
    chi2_stats = chi2(features, labels == category_id)[0]
    # argsort is ascending, so the strongest tokens end up at the tail.
    feature_names = vocabulary[np.argsort(chi2_stats)]
    unigrams = [token for token in feature_names if ' ' not in token]
    header = 'DESTINO {}:'.format(category_id)
    ranking = 'Unigramas más correlacionados:\n{}'.format("', '".join(unigrams[-100:]))
    print(header, ranking, '', sep='\n')

In [None]:
# Same chi-squared token ranking as for 'Destino', now against each value of
# df['Admin_Hospi']; reuses `features` and `tfidf` from the previous cell and prints the
# 50 highest-ranked tokens per value.
# NOTE(review): the printed header still reads 'DESTINO' although the labels are Admin_Hospi.
from sklearn.feature_selection import chi2

labels = df['Admin_Hospi']
# The vocabulary does not change between categories, so it is built once.
vocabulary = np.array(tfidf.get_feature_names())
for category_id in sorted(labels.unique()):
    is_category = labels == category_id
    # chi2 returns (statistics, p-values); only the statistics are used for ranking.
    chi2_stats = chi2(features, is_category)[0]
    # argsort is ascending, so the strongest tokens end up at the tail.
    feature_names = vocabulary[np.argsort(chi2_stats)]
    unigrams = [token for token in feature_names if ' ' not in token]
    top_tokens = "', '".join(unigrams[-50:])
    print('DESTINO {}:'.format(category_id))
    print('Unigramas más correlacionados:\n{}'.format(top_tokens))
    print('')

In [None]:
# Wrap the dense TF-IDF matrix in a DataFrame: one column per token, rows indexed like df.
token_names = tfidf.get_feature_names()
df_features = pd.DataFrame(data=features, columns=token_names, index=df.index)
df_features

In [None]:
# Fit a single LGBM model with fixed hyper-parameters on 'Hospitalization' and show
# (F1, accuracy, ROC AUC) on the hold-out split.
# NOTE(review): X is whatever an earlier cell left in the kernel; if it is the df_2-based
# matrix its rows will not match df['Hospitalization'] and the split will fail - confirm.
y = df['Hospitalization']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# NOTE(review): with max_depth=4 a tree cannot have more than 16 leaves, so num_leaves=500
# is not the binding limit here.
model = LGBMClassifier(learning_rate= 0.07435133934470071, max_depth=4, n_estimators=3000, num_leaves= 500, subsample=0.6586814364523186)
model.fit(X_train, y_train)
# Hard class labels for the threshold-based metrics (F1, accuracy).
y_model = pd.Series(model.predict(X_test), index=X_test.index)
# ROC AUC is a ranking metric: it is computed from the positive-class probability rather
# than from the 0/1 labels, which would reduce the curve to a single operating point.
y_score = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)
f1_score(y_test, y_model), accuracy_score(y_test, y_model), roc_auc_score(y_test, y_score)

In [None]:
# Feature-importance chart for the LGBM model fitted in the previous cell; the tall figure
# leaves room for one bar per feature.
from lightgbm import plot_importance

importance_ax = plot_importance(model, figsize=(10, 35))
importance_ax

In [None]:
# Configure (but do not fit) a randomized hyper-parameter search for logistic regression,
# scored by ROC AUC with 5-fold CV.
# RandomizedSearchCV is not part of the notebook's first import cell, so it is imported here.
from sklearn.model_selection import RandomizedSearchCV

# Candidate values: every listed solver, one-vs-rest only, and max_iter from 1 to 999.
param_dist = {'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
             'multi_class': ['ovr'],
             'max_iter': np.arange(1,1000)}
n_iter_search = 100
# Seeded so the sampled parameter combinations are the same on every run.
model = RandomizedSearchCV(LogisticRegression(),
                               param_distributions=param_dist,
                               n_iter=n_iter_search,
                               cv=5, scoring='roc_auc',
                               n_jobs=-1,
                               verbose=0,
                               random_state=88)

Feature engineering
marcación reingreso <72 horas
número de signos anormales
dummies anormales por signo vital
admisiones previas


Otros:
triage
exámenes
medicamentos
diagnóstico

In [None]:
# Benchmark three classifier families over several hold-out sizes: tune each one with
# BayesSearchCV, plot ROC/PR on the hold-out split and append one row per run to
# outputs/Tabla_Resultados.xlsx.
# NOTE(review): X and y are whatever earlier cells left in the kernel - confirm they cover
# the same rows and the intended target before running.
clasificadores = {'Regresión Logística':LogisticRegression(),
                  'Bosque aleatorio':RandomForestClassifier(),
                  'Potenciación del Gradiente':LGBMClassifier()}
# Search spaces are keyed by the same display names as `clasificadores`, so the lookup
# inside the loop resolves (estimator instances as keys could never be looked up by name).
# NOTE(review): the first two spaces are empty; BayesSearchCV presumably needs at least one
# dimension to optimise - fill them in before running.
espacios = {'Regresión Logística':{},
            'Bosque aleatorio':{},
            'Potenciación del Gradiente':{"n_estimators": Integer(10, 3000),
                                          'max_depth':Integer(1, 40),
                                          'num_leaves': Integer(2, 500),
                                          'learning_rate': Real(0.001, 0.3, prior='uniform'),
                                          'subsample':Real(0.2, 1, prior='uniform')},}

pruebas = [0.9, 0.8]  # hold-out fractions; previously tried: [0.5, 0.4, 0.3, 0.2]
for i in pruebas:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=88)
    # .items() yields (display name, estimator); iterating the dict alone yields only keys.
    for u, v in clasificadores.items():
        # Hyper-parameter search. scoring is set explicitly because best_score_ is logged
        # below as 'ROC_AUC_train'; the default would be the estimator's own score.
        model = BayesSearchCV(v, espacios[u],
                              scoring='roc_auc',
                              n_points= 5,
                              n_iter= 50,
                              cv= 5,
                              verbose = 0,
                              n_jobs=-1, random_state= 88)
        # Fit the search on the training split
        model.fit(X_train, y_train)
        # Keep the best estimator found
        clf = model.best_estimator_
        # Produce the ROC and PR plots and get both AUC values back
        roc, pr = plot_AUC_ROC_PR(u+'_'+str(i), clf, X_test, y_test)
        
        record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
        run = {'Fecha_hora':pd.to_datetime('today'),
               'Tamaño_test': i,
               'Modelo': u,
               'Parametros': model.best_params_,
               'ROC_AUC_train': model.best_score_,
               'ROC_AUC_Score': roc,
               'PR_AUC_Score': pr
              }
        record = record.append(run, ignore_index=True)
        record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)


print()

In [None]:
# For each target column, tune an LGBM classifier with BayesSearchCV, evaluate it on a 30%
# hold-out split and append the run to outputs/Tabla_Resultados.xlsx.
# `X`, `df`, `espacio`, `plot_AUC_ROC_PR` and `Interval` come from other cells of the notebook.
for i in ['Hospitalization', 'ICU', 'Surgery']:
    print(i)
    y = df[i]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    print(X_train.shape)
    print(X_test.shape)
    # The search space must be passed before any keyword argument; a positional argument
    # after `scoring=` is a SyntaxError.
    model = BayesSearchCV(LGBMClassifier(),
                          espacio,
                          scoring='roc_auc',
                          n_points= 5,
                          n_iter= 50,
                          cv= 5,
                          verbose = 0,
                          n_jobs=6,
                          random_state= 88)
    model.fit(X_train, y_train)
    clf = model.best_estimator_
    # Hard class predictions, indexed like X_test so Interval can filter them by index.
    y_model = pd.Series(clf.predict(X_test), index=X_test.index)
    roc, pr = plot_AUC_ROC_PR('LGBMClassifier_V5_'+i, clf, X_test, y_test)
    # NOTE(review): the Interval definition in the first cell returns a single string;
    # confirm a two-value version is the one in scope before relying on this unpacking.
    t1, t2 = Interval(y_test, y_model, alpha = 5.0)
    print(t1, t2)
    record = pd.read_excel('outputs/Tabla_Resultados.xlsx', engine='openpyxl')
    run = {'Fecha_hora':pd.to_datetime('today'),
           'Tamaño_test': X_test.shape,
           'Dependiente':i,
           'Modelo': 'LGBMClassifier_V5',
           'Parametros': model.best_params_,
           'ROC_AUC_train': model.best_score_,
           'ROC_AUC_Score': roc,
           'Intervalo_ROC_AUC':t1, 
           'PR_AUC_Score': pr,
           'Intervalo_PR_AUC':t2,
           'f1_score': f1_score(y_test, y_model), 
           'Accuracy': accuracy_score(y_test, y_model),
           'balanced_accuracy': balanced_accuracy_score(y_test, y_model),
          }
    record = record.append(run, ignore_index=True)
    record.to_excel('outputs/Tabla_Resultados.xlsx', index=False)