### Imbalanced XGBoost
--------------
변경사항
- 1) current_date
- 2) outcome_name ; 값을 변경하면서 재실행
--------------

In [None]:
import json
# Load the shared notebook configuration from config.json, located one directory
# above the current working directory.
# The encoding is given explicitly so that parsing does not depend on the
# platform's default locale encoding.
with open('./../config.json', encoding='utf-8') as config_file:
    cfg = json.load(config_file)

In [None]:
# Run date read from the "working_date" entry of the loaded config.
current_date = cfg["working_date"]
# outcome_list=['meloxicam', 'celecoxib', 'valproic_acid, lamotrigine']
# NOTE(review): the list above quotes 'valproic_acid, lamotrigine' as a single
# entry; presumably these are two separate outcomes ('valproic_acid',
# 'lamotrigine') - confirm before using either as outcome_name.
# Outcome processed by this run; change it and re-run the notebook for another outcome.
outcome_name = 'meloxicam' 

In [None]:
import os
import pathlib
# Resolve the input and output directories relative to the parent of the
# current working directory.
current_dir = pathlib.Path.cwd()
parent_dir = current_dir.parent
data_dir = pathlib.Path(f'{parent_dir}/data/{current_date}/preprocess/{outcome_name}/')
output_result_dir = pathlib.Path(f'{parent_dir}/result/{current_date}/imxgboost/{outcome_name}/')
# Create the result directory together with any missing parents; an already
# existing directory is not an error.
output_result_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

# Names of the entries found in the data directory, with the last suffix removed.
file_list = [pathlib.Path(entry).stem for entry in os.listdir(data_dir)]
file_list

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, LeaveOneOut, cross_validate, GridSearchCV
from imxgboost.imbalance_xgb import imbalance_xgboost as imb_xgb
from model_estimation import *
import functools

# Read the preprocessed table for the selected outcome.
concat_df = pd.read_csv(f'{data_dir}/{outcome_name}.txt', index_col=False)

# Parse the three date columns with a strict format; errors='raise' makes any
# malformed value abort the run here. The columns are dropped again right below.
for date_col in ('cohort_start_date', 'first_abnormal_date', 'concept_date'):
    concat_df[date_col] = pd.to_datetime(
        concat_df[date_col], format='%Y-%m-%d %H:%M:%S', errors='raise'
    )

# concat_df['duration'] = (concat_df['concept_date']-concat_df['cohort_start_date']).dt.days+1
# Remove the identifier and date columns so they are not part of the model input.
concat_df = concat_df.drop(
    columns=['person_id', 'cohort_start_date', 'concept_date', 'first_abnormal_date']
)

### @change column names ; plotting fails when a column name contains JSON delimiter characters.
import re

# Step 1: square brackets become parentheses.
bracket_table = str.maketrans("[]", "()")
concat_df.columns = concat_df.columns.str.translate(bracket_table)

# Step 2: strip every character outside letters, digits, '_', ' ', '/', '(' and ')'.
disallowed_chars = re.compile('[^A-Za-z0-9_ /()]+')
concat_df = concat_df.rename(columns=lambda col_name: disallowed_chars.sub('', col_name))
concat_df.columns

### @check the number of patients per label
# Count the rows for each class to show how imbalanced the data is.
print("label_1 : ", int((concat_df["label"] == 1).sum()))
print("label_0 : ", int((concat_df["label"] == 0).sum()))

### @split the frame into features (x) and target (y)
def split_x_y_data(df):
    """Separate the ``label`` column from the remaining feature columns.

    Args:
        df: DataFrame that contains a ``label`` column.

    Returns:
        A tuple ``(x_data, y_data, new_col)`` where ``x_data`` is ``df``
        without the ``label`` column (original index kept), ``y_data`` is the
        ``label`` Series re-indexed from 0, and ``new_col`` is the column
        index of ``x_data``.

    Raises:
        KeyError: if ``df`` has no ``label`` column.
    """
    # The former `.T` was dropped: transposing a Series returns the Series itself.
    y_data = df['label'].reset_index(drop=True)
    x_data = df.drop(columns='label')
    return x_data, y_data, x_data.columns

x_data, y_data, new_col = split_x_y_data(concat_df)

### @train/test dataset split
# 70/30 split, stratified on the label, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=1,
    stratify=y_data,
)

# Report the shapes of the full, training and test sets.
for tag, features, target in (
    ("data  : ", x_data, y_data),
    ("train : ", x_train, y_train),
    ("test  : ", x_test, y_test),
):
    print(tag, features.shape, target.shape)

In [None]:
### @ imbalanced xgboost

# Two boosters with imbalance-aware objectives: focal loss and weighted loss.
xgboster_focal = imb_xgb(special_objective='focal', num_round=200, max_depth=5, eta=0.1)
xgboster_weight = imb_xgb(special_objective='weighted')

# Grid-search the single loss hyper-parameter of each booster.
focal_gamma_grid = {"focal_gamma": [1.0, 1.5, 2.0, 2.5, 3.0]}
imbalance_alpha_grid = {"imbalance_alpha": [1.5, 2.0, 2.5, 3.0, 4.0]}
CV_focal_booster = GridSearchCV(xgboster_focal, focal_gamma_grid)
CV_weight_booster = GridSearchCV(xgboster_weight, imbalance_alpha_grid)

x_train_np, y_train_np = x_train.to_numpy(), y_train.to_numpy()
CV_focal_booster.fit(x_train_np, y_train_np)
CV_weight_booster.fit(x_train_np, y_train_np)

opt_focal_booster = CV_focal_booster.best_estimator_
# NOTE(review): opt_weight_booster is not used again in the visible cells.
opt_weight_booster = CV_weight_booster.best_estimator_

# RMSE of each prediction flavour of the best focal booster on the test split.
x_test_np, y_test_np = x_test.to_numpy(), y_test.to_numpy()

raw_output = opt_focal_booster.predict(x_test_np, y=None)
rmse = np.sqrt(mean_squared_error(y_test_np, raw_output))
print("RMSE: %f" % (rmse))

sigmoid_output = opt_focal_booster.predict_sigmoid(x_test_np, y=None)
rmse = np.sqrt(mean_squared_error(y_test_np, sigmoid_output))
print("RMSE: %f" % (rmse))

class_output = opt_focal_booster.predict_determine(x_test_np, y=None)
rmse = np.sqrt(mean_squared_error(y_test_np, class_output))
print("RMSE: %f" % (rmse))

prob_output = opt_focal_booster.predict_two_class(x_test_np, y=None)
# rmse = np.sqrt(mean_squared_error(y_test.to_numpy(), prob_output))
# print("RMSE(prob_output): %f" % (rmse))

# NOTE(review): this repeats the same predict call as raw_output above.
preds = opt_focal_booster.predict(x_test_np, y=None)
rmse = np.sqrt(mean_squared_error(y_test_np, preds))
print("RMSE(preds): %f" % (rmse))

# Keep the winning focal_gamma and build a fresh, not yet fitted, booster with it.
xgboost_opt_param = CV_focal_booster.best_params_
xgboost_opt = imb_xgb(
    special_objective='focal',
    num_round=200,
    max_depth=5,
    eta=0.1,
    **xgboost_opt_param,
)

# # cross-validation
# # initialize the splitter
# loo_splitter = LeaveOneOut()
# # initialize the score evalutation function by feeding the 'mode' argument
# # 'mode' can be [\'accuracy\', \'precision\',\'recall\',\'f1\',\'MCC\']
# score_eval_func = functools.partial(xgboost_opt.score_eval_func, mode='accuracy')
# # Leave-One cross validation
# loo_info_dict = cross_validate(xgboost_opt, X=x_data.to_numpy(), y=y_data.to_numpy(), cv=loo_splitter, scoring=make_scorer(score_eval_func))

In [None]:
# Fit the booster built from the best grid-search parameters on the training split
# (passed as plain NumPy arrays).
xgboost_opt.fit(x_train.to_numpy(), y_train.to_numpy())

In [None]:
# Wrap the test-feature DataFrame in a DMatrix and query the underlying booster directly.
dtest = xgb.DMatrix(x_test)
# NOTE(review): prediction_output is not read again in the visible cells.
prediction_output = xgboost_opt.boosting_model.predict(dtest)

In [None]:
# Evaluate the refitted focal-loss model on the held-out test split.
#
# imbalance-xgboost trains with a custom objective, so `predict` returns raw
# margins (logits) rather than probabilities. Thresholding those at 0.5 and
# passing them on as probabilities is incorrect, so the sigmoid-transformed
# output is used instead.
pred_probs = xgboost_opt.predict_sigmoid(x_test, y=y_test)

# Predict class 1 when the probability is at least 0.5, otherwise class 0.
y_pred = [1 if prob >= 0.5 else 0 for prob in pred_probs]
get_clf_eval(y_test, y_pred, pred_probs)

### @ save : plot tree & plot importance feature
make_plot_tree(xgboost_opt.boosting_model, output_result_dir, outcome_name, rankdir=None)
make_plot_tree(xgboost_opt.boosting_model, output_result_dir, outcome_name, rankdir='LR')
make_plot_importance(xgboost_opt.boosting_model, output_result_dir, outcome_name)

### @ save : clf report & model estimation & confusion matrix & roc
clf_report(y_test, y_pred, output_result_dir, outcome_name)
model_performance_evaluation(y_test, y_pred, pred_probs, output_result_dir, outcome_name)
confusion_matrix_figure(y_test, y_pred, output_result_dir, outcome_name)
confusion_matrix_figure2(y_test, y_pred, output_result_dir, outcome_name)
# NOTE(review): ROC_AUC receives the hard labels (y_pred), not pred_probs;
# confirm against the helper's expected input in model_estimation.
AUC, ACC = ROC_AUC(y_test, y_pred, output_result_dir, outcome_name)

### @ save : model json
save_xgb_model_json(xgboost_opt.boosting_model, output_result_dir, outcome_name)

In [None]:
# Display the best hyper-parameters found by the focal-loss grid search.
xgboost_opt_param

In [None]:
# Display the leave-one-out cross-validation results.
# NOTE(review): loo_info_dict is only assigned in later cells (the earlier
# assignment is commented out), so this raises NameError when the notebook is
# run top to bottom.
loo_info_dict

In [None]:
# Scratch probe: calls correct_eval_func with no arguments.
# NOTE(review): other cells bind this kind of evaluator with a `mode` keyword and
# hand it to make_scorer, so it presumably expects (y_true, y_pred) and this bare
# call likely raises TypeError - confirm or remove.
xgboost_opt.correct_eval_func()

In [None]:
# NOTE(review): scratch/debug cell that cannot run as written. `score_dict[]`
# is an empty subscript, which is a SyntaxError.
score_dict[]
# NOTE(review): `scoring` is not assigned anywhere in the visible cells, so this
# f-string would raise NameError even once the line above is fixed.
print(f"scoring is invalid (got {scoring!r}). Refer to the "
        "scoring glossary for details: "
        "https://scikit-learn.org/stable/glossary.html#term-scoring")

In [None]:
# Collect the four evaluator partials in a list.
# NOTE(review): these names are only bound by later cells, so this cell must be
# run after them; `a` itself is not used in the visible cells.
a = [TP_eval_func, TN_eval_func, FP_eval_func, FN_eval_func]

In [None]:
# Confusion-matrix evaluators: each partial fixes the `mode` of the model's
# correctness evaluator to one cell of the confusion matrix.
# Two fixes: TN_eval_func was bound to mode 'FP' and FP_eval_func to 'TN', so
# each name now matches its mode. The 'TP'/'TN'/'FP'/'FN' modes also belong to
# correct_eval_func, whereas score_eval_func takes metric names such as
# 'accuracy' or 'f1'.
TP_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='TP')
TN_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='TN')
FP_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='FP')
FN_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='FN')

In [None]:
# Debug aid: print the repr of the TP scorer embedded in an "invalid scoring"
# style message (presumably copied from a scikit-learn error while debugging).
tp_scorer_repr = repr(make_scorer(TP_eval_func))
print(
    f"scoring is invalid (got {tp_scorer_repr}). Refer to the "
    "scoring glossary for details: "
    "https://scikit-learn.org/stable/glossary.html#term-scoring"
)

In [None]:

# Leave-one-out cross-validation that records, per fold, the four
# confusion-matrix counts of the tuned focal-loss model.

# The splitter was only defined in commented-out code; create it here so the
# cell runs on its own.
loo_splitter = LeaveOneOut()

# Initialise the correctness evaluators by fixing the 'mode' argument.
# 'mode' can be ['TP', 'TN', 'FP', 'FN'].
# Each name now matches its mode (TN/FP were swapped), and correct_eval_func is
# used because these modes belong to it rather than to score_eval_func.
TP_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='TP')
TN_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='TN')
FP_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='FP')
FN_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='FN')

# Scorer dictionary; the keys become the 'test_<key>' entries of the result.
score_dict = {'TP': make_scorer(TP_eval_func),
              'FP': make_scorer(FP_eval_func),
              'TN': make_scorer(TN_eval_func),
              'FN': make_scorer(FN_eval_func)}

# The former `score_dict.__name__ = ...` assignment was removed: a plain dict
# does not accept new attributes, so it raised AttributeError.

# Leave-one-out cross validation over the full data set.
loo_info_dict = cross_validate(xgboost_opt, X=x_data.to_numpy(), y=y_data.to_numpy(), cv=loo_splitter, scoring=score_dict)
# Total number of true positives over all folds.
overall_tp = np.sum(loo_info_dict['test_TP']).astype('float')


In [None]:
# from functools import partial
# class WrappablePartial(functools.partial):
#     @property
#     def __module__(self):
#         return self.func.__module__
#     @property
#     def __name__(self):
#         return "functools.partial({}, *{}, **{})".format(
#             self.func.__name__,
#             self.args,
#             self.keywords
#         )
#     @property
#     def __doc__(self):
#         return self.func.__doc__
   
from functools import wraps

# Leave-one-out cross-validation that records, per fold, the four
# confusion-matrix counts of the tuned focal-loss model.

# The splitter was only defined in commented-out code; create it here so the
# cell runs on its own.
loo_splitter = LeaveOneOut()

# Each name now matches its mode (TN/FP were swapped), and correct_eval_func is
# used because the 'TP'/'TN'/'FP'/'FN' modes belong to it rather than to
# score_eval_func.
TP_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='TP')
TN_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='TN')
FP_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='FP')
FN_eval_func = functools.partial(xgboost_opt.correct_eval_func, mode='FN')

# Scorer dictionary; the keys become the 'test_<key>' entries of the result.
score_dict = {'TP': make_scorer(TP_eval_func),
              'FP': make_scorer(FP_eval_func),
              'TN': make_scorer(TN_eval_func),
              'FN': make_scorer(FN_eval_func)}

# Leave-one-out cross validation over the full data set.
loo_info_dict = cross_validate(xgboost_opt, X=x_data.to_numpy(), y=y_data.to_numpy(), cv=loo_splitter, scoring=score_dict)
# Total number of true positives over all folds.
overall_tp = np.sum(loo_info_dict['test_TP']).astype('float')

In [None]:

# Regression-style parameter set.
# NOTE(review): this dict is reassigned in a later cell before any visible use.
params = {
    "objective": "reg:linear",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10,
}


In [None]:
# Display the underlying booster's predictions for the current `dtest` DMatrix.
# NOTE(review): `dtest` is rebound by the cell that follows, so the output
# depends on which cell last assigned it.
xgboost_opt.boosting_model.predict(dtest)

In [None]:
# Baseline: plain XGBoost binary classifier trained on the same train/test split.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test, label=y_test)

params = {
    'max_depth': 5,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
}
num_rounds = 50
# The training set is labelled 'train' and the evaluation (test) set 'eval'.
wlist = [(dtrain, 'train'), (dtest, 'eval')]
# Pass the hyper-parameters and the early-stopping setting to train().
# NOTE(review): early_stopping_rounds (200) exceeds num_boost_round (50), so
# early stopping can never trigger; the watched 'eval' set is also the test split.
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_rounds,
    early_stopping_rounds=200,
    evals=wlist,
)
pred_probs = xgb_model.predict(dtest)

# Predict class 1 when the probability is at least 0.5, otherwise class 0.
y_pred = [int(prob >= 0.5) for prob in pred_probs]
get_clf_eval(y_test, y_pred, pred_probs)

# NOTE(review): the artefacts below use the same output_result_dir and outcome_name
# as the imbalance-xgboost run; presumably they overwrite those files - verify.
for tree_direction in (None, 'LR'):
    make_plot_tree(xgb_model, output_result_dir, outcome_name, rankdir=tree_direction)
make_plot_importance(xgb_model, output_result_dir, outcome_name)

clf_report(y_test, y_pred, output_result_dir, outcome_name)
model_performance_evaluation(y_test, y_pred, pred_probs, output_result_dir, outcome_name)
confusion_matrix_figure(y_test, y_pred, output_result_dir, outcome_name)
confusion_matrix_figure2(y_test, y_pred, output_result_dir, outcome_name)
AUC, ACC = ROC_AUC(y_test, y_pred, output_result_dir, outcome_name)

save_xgb_model_json(xgb_model, output_result_dir, outcome_name)

In [None]:
# from lightgbm import LGBMClassifier, plot_importance

# def get_model_train_eval(model, ftr_train=None, ftr_test=None, tgt_train=None, tgt_test=None):
#     model.fit(ftr_train, tgt_train)
#     pred = model.predict(ftr_test)
#     pred_proba = model.predict_proba(ftr_test)[:, 1]
#     get_clf_eval(tgt_test, pred, pred_proba)
        
#     fig, ax = plt.subplots(figsize=(10,12))
#     plot_importance(model, ax=ax)
#     plt.show()
    
# lgbm_clf = LGBMClassifier(n_estimators=400, num_leaves=10, n_jobs=-1, boost_from_average=False)
# get_model_train_eval(lgbm_clf, ftr_train=x_train, ftr_test=x_test, tgt_train=y_train, tgt_test=y_test)

In [None]:
# xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
#                 max_depth = 5, alpha = 10, n_estimators = 10)

# xg_reg.fit(x_train,y_train)

# pred_probs = xg_reg.predict(x_test)     

# rmse = np.sqrt(mean_squared_error(y_test, pred_probs))
# print("RMSE: %f" % (rmse))

# # 예측 확률이 0.5 보다 크면 1 , 그렇지 않으면 0 으로 예측값 결정하여 List 객체인 preds 에 저장
# y_pred = [ 1 if x > 0.5 else 0 for x in pred_probs ]

# data_dmatrix = xgb.DMatrix(data=x_data,label=y_data)

# params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
#                 'max_depth': 5, 'alpha': 10}
                
# cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
#                     num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)
                    
# cv_results.head()
# print((cv_results["test-rmse-mean"]).tail(1))

# xgb_model = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=100)

# make_plot_tree(xgb_model, output_result_dir, outcome_name, rankdir=None)
# make_plot_tree(xgb_model, output_result_dir, outcome_name, rankdir='LR')
# make_plot_importance(xgb_model, output_result_dir, outcome_name)

# clf_report(y_test, y_pred, output_result_dir, outcome_name)
# model_performance_evaluation(y_test, y_pred, pred_probs, output_result_dir, outcome_name)
# confusion_matrix_figure(y_test, y_pred, output_result_dir, outcome_name)
# confusion_matrix_figure2(y_test, y_pred, output_result_dir, outcome_name)
# AUC, ACC = ROC_AUC(y_test, y_pred, output_result_dir, outcome_name)