# 1. 하이퍼파라미터 최적화

In [1]:
from copy import deepcopy

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split
from skopt import BayesSearchCV
from sklearn.metrics import recall_score, accuracy_score,f1_score,roc_auc_score,confusion_matrix
import xgboost as xgb

import warnings

In [92]:
def preprocess_and_split_data(X, y):
    """
    One-hot encode the features and split the data into train/test/validation subsets.

    Commas in the encoded column names are replaced by spaces. 20% of the rows are
    held out as the test set, then 10% of the remaining rows are held out as the
    validation set. Both splits are stratified on the target and use random_state=42.
    The shape of every subset is printed.
    -------------------------------------------------------
    input = X (features), y (target)
    return = X_train, y_train, X_test, y_test, X_val, y_val
    -------------------------------------------------------
    """
    encoded = pd.get_dummies(X)
    encoded = encoded.rename(columns=lambda name: name.replace(',', ' '))

    # First split: hold out the test set.
    X_remaining, X_test, y_remaining, y_test = train_test_split(
        encoded, y, test_size=0.2, random_state=42, stratify=y
    )
    # Second split: hold out the validation set from what is left.
    X_train, X_val, y_train, y_val = train_test_split(
        X_remaining, y_remaining, test_size=0.1, random_state=42, stratify=y_remaining
    )

    print(f"훈련 데이터셋의 입력 데이터의 크기: {X_train.shape}, 타겟값의 크기: {y_train.shape}")
    print(f"테스트 데이터셋의 입력 데이터의 크기: {X_test.shape}, 타겟값의 크기: {y_test.shape}")
    print(f"검증 데이터셋의 입력 데이터의 크기: {X_val.shape}, 타겟값의 크기: {y_val.shape}")

    return X_train, y_train, X_test, y_test, X_val, y_val

In [93]:
def baseline_fit(X_train, y_train, X_test, y_test, X_val, y_val):
    """
    Fit an XGBoost classifier without tuning and record its scores as the reference entry.

    The estimator's parameters are read back after an initial fit, each value is
    wrapped in a one-element list, and that single-candidate grid is handed to
    GridSearchCV, so the "search" is a 3-fold cross-validation of one configuration.
    -------------------------------------------------------
    input = X_train, y_train, X_test, y_test, X_val, y_val
    return = scores (dict, stored under the key 'xgbc0'),
             parameters (dict of one-element lists),
             fitted grid-search object
    -------------------------------------------------------
    """
    # Reference XGBoost estimator.
    reference_model = xgb.XGBClassifier(
        objective='binary:logistic',
        booster='gbtree',
        eval_metric='auc',
        tree_method='hist',
        grow_policy='lossguide',
        use_label_encoder=False,
    )
    # NOTE(review): presumably the estimator is fitted first so that get_params()
    # reports resolved values instead of unset ones -- confirm for the installed
    # xgboost version before removing this fit.
    reference_model.fit(X_train, y_train)

    # Wrap every parameter value in a list so the dict is a valid one-point grid.
    default_params = {name: [value] for name, value in reference_model.get_params().items()}

    # Cross-validate the single default configuration.
    clf0 = GridSearchCV(
        estimator=reference_model,
        param_grid=default_params,
        scoring='accuracy',
        cv=3,
        return_train_score=True,
        verbose=0,
    )
    clf0.fit(X_train, y_train.values.ravel())

    # Class predictions from predict(), plus validation predictions at a 0.4 probability cut-off.
    pred_train = clf0.predict(X_train)
    pred_test = clf0.predict(X_test)
    pred_val = clf0.predict(X_val)
    proba_val = clf0.predict_proba(X_val)[:, 1]
    pred_val_thr = proba_val >= 0.4

    baseline_entry = {
        'iterable_parameter': np.nan,
        'classifier': deepcopy(clf0),
        'cv_results': pd.DataFrame(clf0.cv_results_).copy(),
        # Confusion matrices
        'cfm_train': confusion_matrix(y_train, pred_train),
        'cfm_test': confusion_matrix(y_test, pred_test),
        'cfm_unseen': confusion_matrix(y_val, pred_val),
        'cfm_unseen_threds': confusion_matrix(y_val, pred_val_thr),
        # Recall
        'train_recall': recall_score(y_train, pred_train),
        'test_recall': recall_score(y_test, pred_test),
        'unseen_recall': recall_score(y_val, pred_val),
        'unseen_threds_recall': recall_score(y_val, pred_val_thr),
        # F1 score, computed separately for label 1 and label 0
        'train F1-score label 1': f1_score(y_train, pred_train, pos_label=1),
        'train F1-score label 0': f1_score(y_train, pred_train, pos_label=0),
        'test F1-score label 1': f1_score(y_test, pred_test, pos_label=1),
        'test F1-score label 0': f1_score(y_test, pred_test, pos_label=0),
        'unseen F1-score label 1': f1_score(y_val, pred_val, pos_label=1),
        'unseen F1-score label 0': f1_score(y_val, pred_val, pos_label=0),
        'unseen threds F1-score label 1': f1_score(y_val, pred_val_thr, pos_label=1),
        'unseen threds F1-score label 0': f1_score(y_val, pred_val_thr, pos_label=0),
        # ROC AUC from the predicted probability of label 1
        'test roc auc score': roc_auc_score(y_test, clf0.predict_proba(X_test)[:, 1]),
        'unseen roc auc score': roc_auc_score(y_val, clf0.predict_proba(X_val)[:, 1]),
        # Parameters selected by the search
        'best_params': clf0.best_params_,
        'predict_proba': proba_val,
    }

    return {'xgbc0': baseline_entry}, default_params, clf0

In [94]:
def _coordinate_descent_metrics(clf, X_train, y_train, X_test, y_test, X_val, y_val):
    """Score a fitted search object on the train, test and validation subsets."""
    train_predictions = clf.predict(X_train)
    test_predictions = clf.predict(X_test)
    unseen_predictions = clf.predict(X_val)
    unseen_predictions_proba = clf.predict_proba(X_val)[:, 1]
    # Alternative validation predictions using a 0.4 probability cut-off.
    unseen_predictions_threds = unseen_predictions_proba >= 0.4

    return {
        'classifier': deepcopy(clf),
        'cv_results': pd.DataFrame(clf.cv_results_),
        'cfm_train': confusion_matrix(y_train, train_predictions),
        'cfm_test': confusion_matrix(y_test, test_predictions),
        'cfm_unseen': confusion_matrix(y_val, unseen_predictions),
        'cfm_unseen_threds': confusion_matrix(y_val, unseen_predictions_threds),
        'train_recall': recall_score(y_train, train_predictions),
        'test_recall': recall_score(y_test, test_predictions),
        'unseen_recall': recall_score(y_val, unseen_predictions),
        'unseen_threds_recall': recall_score(y_val, unseen_predictions_threds),
        'train F1-score label 1': f1_score(y_train, train_predictions, pos_label=1),
        'train F1-score label 0': f1_score(y_train, train_predictions, pos_label=0),
        'test F1-score label 1': f1_score(y_test, test_predictions, pos_label=1),
        'test F1-score label 0': f1_score(y_test, test_predictions, pos_label=0),
        'unseen F1-score label 1': f1_score(y_val, unseen_predictions, pos_label=1),
        'unseen F1-score label 0': f1_score(y_val, unseen_predictions, pos_label=0),
        'unseen threds F1-score label 1': f1_score(y_val, unseen_predictions_threds, pos_label=1),
        'unseen threds F1-score label 0': f1_score(y_val, unseen_predictions_threds, pos_label=0),
        'test roc auc score': roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]),
        'unseen roc auc score': roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1]),
        'best_params': clf.best_params_,
        'predict_proba': unseen_predictions_proba,
    }


def coordinate_descent(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, default_params, params, results_dict, clf0):
    """
    Tune hyperparameters one at a time (coordinate descent) with GridSearchCV.

    For each key of param_grid, in order, a grid search is run in which only that
    parameter varies over its candidate values; every other parameter is pinned to
    the best value found by the previous search (or by clf0 for the first one).
    The scores of step i are stored in results_dict under 'xgbc{i+1}'.

    Both `params` and `results_dict` are modified in place.
    -----------------------------------------------------------------------------------------------
    input = X_train, y_train, X_test, y_test, X_val, y_val, param_grid, default_params, params, results_dict, clf0
    return = scores (dict), total number of candidate values over all grids (int)
    -----------------------------------------------------------------------------------------------
    """
    # Total number of configurations evaluated by this routine (sum of the grid
    # sizes). A plain sum also handles an empty param_grid, where the previous
    # cumsum(...)[-1] raised IndexError.
    gcvj = sum(len(candidates) for candidates in param_grid.values())

    # default_params stores each value wrapped in a one-element list (grid format);
    # unwrap them so the estimator is constructed with scalar parameter values.
    estimator_params = {
        key: value[0] if isinstance(value, list) and len(value) == 1 else value
        for key, value in default_params.items()
    }

    # Best parameters found so far; starts from the baseline search and is replaced
    # after every step. This replaces a bare `except:` that depended on `clf` being
    # undefined during the first iteration and would have hidden any other error.
    incumbent_params = clf0.best_params_

    for i, grid_key in enumerate(param_grid):
        for param_key in params:
            if param_key == grid_key:
                # The coordinate being tuned: try every candidate value.
                params[param_key] = param_grid[grid_key]
            else:
                # Every other coordinate stays fixed at the incumbent value.
                params[param_key] = [incumbent_params[param_key]]

        xgbc = xgb.XGBClassifier(**estimator_params)

        clf = GridSearchCV(estimator=xgbc, param_grid=params, scoring='accuracy', return_train_score=True, verbose=0, cv=3)
        clf.fit(X_train, y_train.values.ravel())
        incumbent_params = clf.best_params_

        results_dict[f'xgbc{i+1}'] = {
            'iterable_parameter': grid_key,
            **_coordinate_descent_metrics(clf, X_train, y_train, X_test, y_test, X_val, y_val),
        }

    return results_dict, gcvj

In [95]:
def randomsearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, default_params, results_dict, gcvj, random_state=None):
    """
    Tune hyperparameters with RandomizedSearchCV and record the scores.

    The estimator is built from default_params (each value is expected to be a
    one-element list, which is unwrapped here), and `gcvj` parameter settings are
    sampled from param_grid. The scores are stored in results_dict under
    'xgbc_rcv'; results_dict is modified in place and also returned.
    -----------------------------------------------------------------------------------------------
    input = X_train, y_train, X_test, y_test, X_val, y_val, param_grid, default_params, results_dict, gcvj,
            random_state (optional seed forwarded to RandomizedSearchCV; the default None
            keeps the previous unseeded behaviour)
    return = scores (dict)
    -----------------------------------------------------------------------------------------------
    """
    # gcvj may arrive as a numpy integer; pass a plain int as n_iter, as the
    # Bayesian-search counterpart does.
    rcvj = int(gcvj)

    # Unwrap the one-element lists into scalar constructor arguments.
    default_params_xgb = {key: values[0] for key, values in default_params.items()}

    xgbc = xgb.XGBClassifier(**default_params_xgb)

    clf = RandomizedSearchCV(estimator=xgbc, param_distributions=param_grid, scoring='accuracy',
                             return_train_score=True, verbose=0, cv=3, n_iter=rcvj,
                             random_state=random_state)
    clf.fit(X_train, y_train.values.ravel())

    df = pd.DataFrame(clf.cv_results_)

    # Class predictions from predict(), plus validation predictions at a 0.4 probability cut-off.
    train_predictions = clf.predict(X_train)
    test_predictions = clf.predict(X_test)
    unseen_predictions = clf.predict(X_val)
    unseen_predictions_proba = clf.predict_proba(X_val)[:, 1]
    unseen_predictions_threds = unseen_predictions_proba >= 0.4

    results_dict['xgbc_rcv'] = {
        'classifier': deepcopy(clf),
        'cv_results': df,
        # Confusion matrices
        'cfm_train': confusion_matrix(y_train, train_predictions),
        'cfm_test': confusion_matrix(y_test, test_predictions),
        'cfm_unseen': confusion_matrix(y_val, unseen_predictions),
        'cfm_unseen_threds': confusion_matrix(y_val, unseen_predictions_threds),
        # Recall
        'train_recall': recall_score(y_train, train_predictions),
        'test_recall': recall_score(y_test, test_predictions),
        'unseen_recall': recall_score(y_val, unseen_predictions),
        'unseen_threds_recall': recall_score(y_val, unseen_predictions_threds),
        # F1 score, computed separately for label 1 and label 0
        'train F1-score label 1': f1_score(y_train, train_predictions, pos_label=1),
        'train F1-score label 0': f1_score(y_train, train_predictions, pos_label=0),
        'test F1-score label 1': f1_score(y_test, test_predictions, pos_label=1),
        'test F1-score label 0': f1_score(y_test, test_predictions, pos_label=0),
        'unseen F1-score label 1': f1_score(y_val, unseen_predictions, pos_label=1),
        'unseen F1-score label 0': f1_score(y_val, unseen_predictions, pos_label=0),
        'unseen threds F1-score label 1': f1_score(y_val, unseen_predictions_threds, pos_label=1),
        'unseen threds F1-score label 0': f1_score(y_val, unseen_predictions_threds, pos_label=0),
        # ROC AUC from the predicted probability of label 1
        'test roc auc score': roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]),
        'unseen roc auc score': roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1]),
        'best_params': clf.best_params_,
        'predict_proba': unseen_predictions_proba,
    }

    return results_dict

In [96]:
def baysiansearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, default_params, results_dict, gcvj, random_state=None):
    """
    Tune hyperparameters with BayesSearchCV and record the scores.

    The estimator is built from default_params (each value is expected to be a
    one-element list, which is unwrapped here), and the search runs for `gcvj`
    iterations over param_grid. The scores are stored in results_dict under
    'xgbc_bcv'; results_dict is modified in place and also returned.
    -----------------------------------------------------------------------------------------------
    input = X_train, y_train, X_test, y_test, X_val, y_val, param_grid, default_params, results_dict, gcvj,
            random_state (optional seed forwarded to BayesSearchCV; the default None
            keeps the previous unseeded behaviour)
    return = scores (dict)
    -----------------------------------------------------------------------------------------------
    """
    # gcvj may arrive as a numpy integer; n_iter is passed as a plain int.
    bcvj = int(gcvj)

    # Unwrap the one-element lists into scalar constructor arguments.
    default_params_xgb = {key: values[0] for key, values in default_params.items()}

    xgbc = xgb.XGBClassifier(**default_params_xgb)

    clf = BayesSearchCV(estimator=xgbc, search_spaces=param_grid, n_iter=bcvj, scoring='accuracy',
                        cv=3, return_train_score=True, verbose=0, random_state=random_state)
    clf.fit(X_train, y_train.values.ravel())

    df = pd.DataFrame(clf.cv_results_)

    # Class predictions from predict(), plus validation predictions at a 0.4 probability cut-off.
    train_predictions = clf.predict(X_train)
    test_predictions = clf.predict(X_test)
    unseen_predictions = clf.predict(X_val)
    unseen_predictions_proba = clf.predict_proba(X_val)[:, 1]
    unseen_predictions_threds = unseen_predictions_proba >= 0.4

    results_dict['xgbc_bcv'] = {
        'classifier': deepcopy(clf),
        'cv_results': df,
        # Confusion matrices
        'cfm_train': confusion_matrix(y_train, train_predictions),
        'cfm_test': confusion_matrix(y_test, test_predictions),
        'cfm_unseen': confusion_matrix(y_val, unseen_predictions),
        'cfm_unseen_threds': confusion_matrix(y_val, unseen_predictions_threds),
        # Recall
        'train_recall': recall_score(y_train, train_predictions),
        'test_recall': recall_score(y_test, test_predictions),
        'unseen_recall': recall_score(y_val, unseen_predictions),
        'unseen_threds_recall': recall_score(y_val, unseen_predictions_threds),
        # F1 score, computed separately for label 1 and label 0
        'train F1-score label 1': f1_score(y_train, train_predictions, pos_label=1),
        'train F1-score label 0': f1_score(y_train, train_predictions, pos_label=0),
        'test F1-score label 1': f1_score(y_test, test_predictions, pos_label=1),
        'test F1-score label 0': f1_score(y_test, test_predictions, pos_label=0),
        'unseen F1-score label 1': f1_score(y_val, unseen_predictions, pos_label=1),
        'unseen F1-score label 0': f1_score(y_val, unseen_predictions, pos_label=0),
        'unseen threds F1-score label 1': f1_score(y_val, unseen_predictions_threds, pos_label=1),
        'unseen threds F1-score label 0': f1_score(y_val, unseen_predictions_threds, pos_label=0),
        # ROC AUC from the predicted probability of label 1
        'test roc auc score': roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]),
        'unseen roc auc score': roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1]),
        'best_params': clf.best_params_,
        'predict_proba': unseen_predictions_proba,
    }

    return results_dict

In [97]:
def eval(results_dict):
    """
    Print the validation-set recall and label-1 F1 score of every stored model.

    Two sections are printed: one for the scores stored under the plain 'unseen'
    keys (headed as threshold 0.5) and one for the scores stored under the
    'unseen threds' keys (headed as threshold 0.4).

    NOTE: this function shadows the built-in `eval` in the notebook namespace.
    """
    report_sections = (
        ("임계값 0.5 일 때, 예측 성능", 'unseen_recall', 'unseen F1-score label 1'),
        ("임계값 0.4 일 때, 예측 성능", 'unseen_threds_recall', 'unseen threds F1-score label 1'),
    )
    for header, recall_key, f1_key in report_sections:
        print(header)
        for model, scores in results_dict.items():
            print(f"{model} - 재현율: {scores[recall_key]}, F1-Score: {scores[f1_key]}")

### 조합 1. 통계검정 재무/비재무, 통계검정 재무비율/파생재무비율, 재무등급, 지방지표, 재무비율점수

In [98]:
# Load the dataset for feature combination 1 (the CSV is read as CP949).
df_1 = pd.read_csv('./custom_data/gridsearch_data_1.csv', encoding='cp949')

# '휴폐업구분' is the target column; all remaining columns are used as features.
X = df_1.drop(['휴폐업구분'], axis=1)
y = df_1['휴폐업구분']

# One-hot encode the features and split into train/test/validation subsets.
X_train, y_train, X_test, y_test, X_val, y_val = preprocess_and_split_data(X, y)

훈련 데이터셋의 입력 데이터의 크기: (52527, 131), 타겟값의 크기: (52527,)
테스트 데이터셋의 입력 데이터의 크기: (14592, 131), 타겟값의 크기: (14592,)
검증 데이터셋의 입력 데이터의 크기: (5837, 131), 타겟값의 크기: (5837,)


In [99]:
# Fit the untuned reference model; default_params maps every estimator parameter to a one-element list.
results_dict_base, default_params, gridsearch_estimator = baseline_fit(X_train, y_train, X_test, y_test, X_val, y_val)
# coordinate_descent edits `params` in place, so give it a deep copy and keep default_params intact.
params = deepcopy(default_params)

# Candidate values for each hyperparameter to tune (shared by all three search strategies).
param_grid = {'gamma': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4, 200],
              'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.300000012, 0.4, 0.5, 0.6, 0.7],
              'max_depth': [5,6,7,8,9,10,11,12,13,14],
              'n_estimators': [50,65,80,100,115,130,150],
              'reg_alpha': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
              'reg_lambda': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200]}

# Run coordinate descent, random search and Bayesian search in turn. Each call adds its
# entries to the dict it receives and returns that same dict, so all results_dict_* names
# refer to one object. gcvj (total number of grid values) is reused as n_iter for the
# random and Bayesian searches.
results_dict_coord, gcvj = coordinate_descent(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, params, results_dict_base, gridsearch_estimator)
results_dict_random = randomsearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, results_dict_coord, gcvj)
results_dict_final = baysiansearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, results_dict_random, gcvj)

In [100]:
# Print validation recall and label-1 F1 for every stored model (this `eval` is the notebook's own function).
eval(results_dict_final)

임계값 0.5 일 때, 예측 성능
xgbc0 - 재현율: 0.6411960132890365, F1-Score: 0.7408829174664107
xgbc1 - 재현율: 0.6445182724252492, F1-Score: 0.7475915221579961
xgbc2 - 재현율: 0.6212624584717608, F1-Score: 0.7333333333333333
xgbc3 - 재현율: 0.6212624584717608, F1-Score: 0.7347740667976425
xgbc4 - 재현율: 0.6146179401993356, F1-Score: 0.7312252964426877
xgbc5 - 재현율: 0.6146179401993356, F1-Score: 0.7312252964426877
xgbc6 - 재현율: 0.6146179401993356, F1-Score: 0.7283464566929133
xgbc_rcv - 재현율: 0.627906976744186, F1-Score: 0.7382812500000001
xgbc_bcv - 재현율: 0.6112956810631229, F1-Score: 0.731610337972167
임계값 0.4 일 때, 예측 성능
xgbc0 - 재현율: 0.6777408637873754, F1-Score: 0.7597765363128492
xgbc1 - 재현율: 0.6644518272425249, F1-Score: 0.7352941176470589
xgbc2 - 재현율: 0.6511627906976745, F1-Score: 0.7340823970037454
xgbc3 - 재현율: 0.6445182724252492, F1-Score: 0.7376425855513308
xgbc4 - 재현율: 0.6345514950166113, F1-Score: 0.731800766283525
xgbc5 - 재현율: 0.6345514950166113, F1-Score: 0.731800766283525
xgbc6 - 재현율: 0.634551495016611

### 조합 2. 통계검정 재무/비재무, 통계검정 재무비율, 재무등급, 지방지표, 재무점수, 비재무점수

In [101]:
# Load the dataset for feature combination 2 (the CSV is read as CP949).
df_1 = pd.read_csv('./custom_data/gridsearch_data_2.csv', encoding='cp949')

# '휴폐업구분' is the target column; all remaining columns are used as features.
X = df_1.drop(['휴폐업구분'], axis=1)
y = df_1['휴폐업구분']

# One-hot encode the features and split into train/test/validation subsets.
X_train, y_train, X_test, y_test, X_val, y_val = preprocess_and_split_data(X, y)

훈련 데이터셋의 입력 데이터의 크기: (52527, 126), 타겟값의 크기: (52527,)
테스트 데이터셋의 입력 데이터의 크기: (14592, 126), 타겟값의 크기: (14592,)
검증 데이터셋의 입력 데이터의 크기: (5837, 126), 타겟값의 크기: (5837,)


In [102]:
# Fit the untuned reference model; default_params maps every estimator parameter to a one-element list.
results_dict_base, default_params, gridsearch_estimator = baseline_fit(X_train, y_train, X_test, y_test, X_val, y_val)
# coordinate_descent edits `params` in place, so give it a deep copy and keep default_params intact.
params = deepcopy(default_params)

# Candidate values for each hyperparameter to tune (shared by all three search strategies).
param_grid = {'gamma': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4, 200],
              'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.300000012, 0.4, 0.5, 0.6, 0.7],
              'max_depth': [5,6,7,8,9,10,11,12,13,14],
              'n_estimators': [50,65,80,100,115,130,150],
              'reg_alpha': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
              'reg_lambda': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200]}

# Run coordinate descent, random search and Bayesian search in turn. Each call adds its
# entries to the dict it receives and returns that same dict, so all results_dict_* names
# refer to one object. gcvj (total number of grid values) is reused as n_iter for the
# random and Bayesian searches.
results_dict_coord, gcvj = coordinate_descent(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, params, results_dict_base, gridsearch_estimator)
results_dict_random = randomsearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, results_dict_coord, gcvj)
results_dict_final = baysiansearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, results_dict_random, gcvj)

In [103]:
# Print validation recall and label-1 F1 for every stored model (this `eval` is the notebook's own function).
eval(results_dict_final)

임계값 0.5 일 때, 예측 성능
xgbc0 - 재현율: 0.6245847176079734, F1-Score: 0.7372549019607844
xgbc1 - 재현율: 0.6146179401993356, F1-Score: 0.7212475633528265
xgbc2 - 재현율: 0.6146179401993356, F1-Score: 0.7212475633528265
xgbc3 - 재현율: 0.6112956810631229, F1-Score: 0.71875
xgbc4 - 재현율: 0.6112956810631229, F1-Score: 0.71875
xgbc5 - 재현율: 0.6312292358803987, F1-Score: 0.7392996108949418
xgbc6 - 재현율: 0.6179401993355482, F1-Score: 0.7265625
xgbc_rcv - 재현율: 0.6245847176079734, F1-Score: 0.7387033398821219
xgbc_bcv - 재현율: 0.6378737541528239, F1-Score: 0.7500000000000001
임계값 0.4 일 때, 예측 성능
xgbc0 - 재현율: 0.6378737541528239, F1-Score: 0.7218045112781954
xgbc1 - 재현율: 0.6511627906976745, F1-Score: 0.7354596622889306
xgbc2 - 재현율: 0.6511627906976745, F1-Score: 0.7354596622889306
xgbc3 - 재현율: 0.6411960132890365, F1-Score: 0.7352380952380954
xgbc4 - 재현율: 0.6411960132890365, F1-Score: 0.7352380952380954
xgbc5 - 재현율: 0.6411960132890365, F1-Score: 0.7380497131931165
xgbc6 - 재현율: 0.6611295681063123, F1-Score: 0.739776951672

### 조합 3. 통계검정 재무/비재무, 통계검정 재무비율, 재무등급

In [106]:
# Load the dataset for feature combination 3 (no explicit encoding is passed for this file).
df_1 = pd.read_csv('./custom_data/gridsearch_data_3.csv')

# '휴폐업구분' is the target column; all remaining columns are used as features.
X = df_1.drop(['휴폐업구분'], axis=1)
y = df_1['휴폐업구분']

# One-hot encode the features and split into train/test/validation subsets.
X_train, y_train, X_test, y_test, X_val, y_val = preprocess_and_split_data(X, y)

훈련 데이터셋의 입력 데이터의 크기: (52536, 110), 타겟값의 크기: (52536,)
테스트 데이터셋의 입력 데이터의 크기: (14594, 110), 타겟값의 크기: (14594,)
검증 데이터셋의 입력 데이터의 크기: (5838, 110), 타겟값의 크기: (5838,)


In [107]:
# Fit the untuned reference model; default_params maps every estimator parameter to a one-element list.
results_dict_base, default_params, gridsearch_estimator = baseline_fit(X_train, y_train, X_test, y_test, X_val, y_val)
# coordinate_descent edits `params` in place, so give it a deep copy and keep default_params intact.
params = deepcopy(default_params)

# Candidate values for each hyperparameter to tune (shared by all three search strategies).
param_grid = {'gamma': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4, 200],
              'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.300000012, 0.4, 0.5, 0.6, 0.7],
              'max_depth': [5,6,7,8,9,10,11,12,13,14],
              'n_estimators': [50,65,80,100,115,130,150],
              'reg_alpha': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
              'reg_lambda': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200]}

# Run coordinate descent, random search and Bayesian search in turn. Each call adds its
# entries to the dict it receives and returns that same dict, so all results_dict_* names
# refer to one object. gcvj (total number of grid values) is reused as n_iter for the
# random and Bayesian searches.
results_dict_coord, gcvj = coordinate_descent(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, params, results_dict_base, gridsearch_estimator)
results_dict_random = randomsearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, results_dict_coord, gcvj)
results_dict_final = baysiansearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, results_dict_random, gcvj)

In [108]:
# Print validation recall and label-1 F1 for every stored model (this `eval` is the notebook's own function).
eval(results_dict_final)

임계값 0.5 일 때, 예측 성능
xgbc0 - 재현율: 0.5860927152317881, F1-Score: 0.6954813359528488
xgbc1 - 재현율: 0.5960264900662252, F1-Score: 0.7086614173228347
xgbc2 - 재현율: 0.5960264900662252, F1-Score: 0.7086614173228347
xgbc3 - 재현율: 0.5761589403973509, F1-Score: 0.6918489065606361
xgbc4 - 재현율: 0.5761589403973509, F1-Score: 0.6918489065606361
xgbc5 - 재현율: 0.5761589403973509, F1-Score: 0.6918489065606361
xgbc6 - 재현율: 0.5993377483443708, F1-Score: 0.7125984251968503
xgbc_rcv - 재현율: 0.5927152317880795, F1-Score: 0.7075098814229249
xgbc_bcv - 재현율: 0.5728476821192053, F1-Score: 0.6824457593688363
임계값 0.4 일 때, 예측 성능
xgbc0 - 재현율: 0.6192052980132451, F1-Score: 0.7069943289224951
xgbc1 - 재현율: 0.6423841059602649, F1-Score: 0.7334593572778828
xgbc2 - 재현율: 0.6423841059602649, F1-Score: 0.7334593572778828
xgbc3 - 재현율: 0.6158940397350994, F1-Score: 0.711281070745698
xgbc4 - 재현율: 0.6158940397350994, F1-Score: 0.711281070745698
xgbc5 - 재현율: 0.6158940397350994, F1-Score: 0.711281070745698
xgbc6 - 재현율: 0.63245033112582

# 베이스라인

In [113]:
# Load the baseline dataset (the CSV is read as CP949).
df_1 = pd.read_csv('./custom_data/baseline_data.csv', encoding='cp949')

# '휴폐업구분' is the target column; all remaining columns are used as features.
X = df_1.drop(['휴폐업구분'], axis=1)
y = df_1['휴폐업구분']

# One-hot encode the features and split into train/test/validation subsets.
X_train, y_train, X_test, y_test, X_val, y_val = preprocess_and_split_data(X, y)

훈련 데이터셋의 입력 데이터의 크기: (52536, 55), 타겟값의 크기: (52536,)
테스트 데이터셋의 입력 데이터의 크기: (14594, 55), 타겟값의 크기: (14594,)
검증 데이터셋의 입력 데이터의 크기: (5838, 55), 타겟값의 크기: (5838,)


In [114]:
# Fit the untuned reference model; default_params maps every estimator parameter to a one-element list.
results_dict_base, default_params, gridsearch_estimator = baseline_fit(X_train, y_train, X_test, y_test, X_val, y_val)
# coordinate_descent edits `params` in place, so give it a deep copy and keep default_params intact.
params = deepcopy(default_params)

# Candidate values for each hyperparameter to tune (shared by all three search strategies).
param_grid = {'gamma': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4, 200],
              'learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.300000012, 0.4, 0.5, 0.6, 0.7],
              'max_depth': [5,6,7,8,9,10,11,12,13,14],
              'n_estimators': [50,65,80,100,115,130,150],
              'reg_alpha': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
              'reg_lambda': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200]}

# Run coordinate descent, random search and Bayesian search in turn. Each call adds its
# entries to the dict it receives and returns that same dict, so all results_dict_* names
# refer to one object. gcvj (total number of grid values) is reused as n_iter for the
# random and Bayesian searches.
results_dict_coord, gcvj = coordinate_descent(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, params, results_dict_base, gridsearch_estimator)
results_dict_random = randomsearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, results_dict_coord, gcvj)
results_dict_final = baysiansearch_fit(X_train, y_train, X_test, y_test, X_val, y_val, param_grid, 
                                                default_params, results_dict_random, gcvj)

In [None]:
# Bare expression: in a notebook cell this displays the full results dictionary.
results_dict_final

In [123]:
# Print validation recall and label-1 F1 for every stored model (this `eval` is the notebook's own function).
eval(results_dict_final)

임계값 0.5 일 때, 예측 성능
xgbc0 - 재현율: 0.6125827814569537, F1-Score: 0.7156673114119921
xgbc1 - 재현율: 0.6026490066225165, F1-Score: 0.7123287671232875
xgbc2 - 재현율: 0.609271523178808, F1-Score: 0.7159533073929961
xgbc3 - 재현율: 0.609271523178808, F1-Score: 0.7159533073929961
xgbc4 - 재현율: 0.6125827814569537, F1-Score: 0.7184466019417476
xgbc5 - 재현율: 0.6125827814569537, F1-Score: 0.7184466019417476
xgbc6 - 재현율: 0.6026490066225165, F1-Score: 0.7109374999999999
xgbc_rcv - 재현율: 0.609271523178808, F1-Score: 0.7215686274509804
xgbc_bcv - 재현율: 0.6059602649006622, F1-Score: 0.7190569744597249
임계값 0.4 일 때, 예측 성능
xgbc0 - 재현율: 0.6357615894039735, F1-Score: 0.7218045112781956
xgbc1 - 재현율: 0.6324503311258278, F1-Score: 0.7193973634651599
xgbc2 - 재현율: 0.6390728476821192, F1-Score: 0.7269303201506591
xgbc3 - 재현율: 0.6390728476821192, F1-Score: 0.7269303201506591
xgbc4 - 재현율: 0.6423841059602649, F1-Score: 0.7265917602996254
xgbc5 - 재현율: 0.6423841059602649, F1-Score: 0.7265917602996254
xgbc6 - 재현율: 0.63245033112582