### 데이터 기반으로 모델 적합하여 결과 확인

- tree, rforest, mlp / 기존 논문과 동일하게
- 기준금리-준칙금리 차이를 넣어서 계산

In [1]:
import os

import pandas as pd

from tqdm import tqdm

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn import tree
import graphviz
    
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.utils import resample

In [2]:
# Set up a Korean-capable font (NanumBarunGothic) for matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
!apt -qq -y install fonts-nanum
 
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
fm.fontManager.addfont(fontpath)  # register the freshly installed font; used here instead of the older "rebuild" approach
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic')

plt.rc('axes', unicode_minus=False)  # prevents the "Glyph 8722" (minus sign) problem

fonts-nanum is already the newest version (20200506-1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [3]:
# Load the merged dataset: the first CSV column becomes the index and is labelled "date".
data = pd.read_csv("./DATA/merged_data.csv", index_col=0).rename_axis("date")
data

Unnamed: 0_level_0,cosine_similarity,dissimilarity,base_rate,base_rate_diff,FFR,ko-us_gap,governor,president,tr1,tr2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1999-01-07,,,,,4.750,,전철환,김대중,,
1999-02-04,0.727940,0.272060,,,4.750,,전철환,김대중,,
1999-03-04,0.733303,0.266697,,,4.750,,전철환,김대중,,
1999-04-08,0.811213,0.188787,,,4.750,,전철환,김대중,,
1999-05-06,0.759641,0.240359,4.75,,4.750,0.000,전철환,김대중,,
...,...,...,...,...,...,...,...,...,...,...
2022-11-24,0.917186,0.082814,3.25,1.0,3.875,-0.625,이창용,윤석열,7.225832,4.914102
2023-01-13,0.928007,0.071993,3.50,1.0,4.375,-0.875,이창용,윤석열,6.250150,1.879616
2023-02-23,0.926393,0.073607,3.50,0.0,4.625,-1.125,이창용,윤석열,6.250150,1.879616
2023-04-11,0.917157,0.082843,3.50,0.0,4.875,-1.375,이창용,윤석열,6.250150,1.879616


In [4]:
# Lag the explanatory variable: ko-us_gap_t1 holds the previous row's ko-us_gap.
data = data.assign(**{"ko-us_gap_t1": data["ko-us_gap"].shift(1)})
data

Unnamed: 0_level_0,cosine_similarity,dissimilarity,base_rate,base_rate_diff,FFR,ko-us_gap,governor,president,tr1,tr2,ko-us_gap_t1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1999-01-07,,,,,4.750,,전철환,김대중,,,
1999-02-04,0.727940,0.272060,,,4.750,,전철환,김대중,,,
1999-03-04,0.733303,0.266697,,,4.750,,전철환,김대중,,,
1999-04-08,0.811213,0.188787,,,4.750,,전철환,김대중,,,
1999-05-06,0.759641,0.240359,4.75,,4.750,0.000,전철환,김대중,,,
...,...,...,...,...,...,...,...,...,...,...,...
2022-11-24,0.917186,0.082814,3.25,1.0,3.875,-0.625,이창용,윤석열,7.225832,4.914102,-0.125
2023-01-13,0.928007,0.071993,3.50,1.0,4.375,-0.875,이창용,윤석열,6.250150,1.879616,-0.625
2023-02-23,0.926393,0.073607,3.50,0.0,4.625,-1.125,이창용,윤석열,6.250150,1.879616,-0.875
2023-04-11,0.917157,0.082843,3.50,0.0,4.875,-1.375,이창용,윤석열,6.250150,1.879616,-1.125


In [5]:
# Dissimilarity at t plus its first six lags, one column per lag (dissim_t1 ... dissim_t6).
_dissim_t = data["dissimilarity"].rename("dissim_t")
dissim = pd.concat(
    [_dissim_t] + [_dissim_t.shift(lag).rename(f"dissim_t{lag}") for lag in range(1, 6 + 1)],
    axis=1,
)
dissim

Unnamed: 0_level_0,dissim_t,dissim_t1,dissim_t2,dissim_t3,dissim_t4,dissim_t5,dissim_t6
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1999-01-07,,,,,,,
1999-02-04,0.272060,,,,,,
1999-03-04,0.266697,0.272060,,,,,
1999-04-08,0.188787,0.266697,0.272060,,,,
1999-05-06,0.240359,0.188787,0.266697,0.272060,,,
...,...,...,...,...,...,...,...
2022-11-24,0.082814,0.092589,0.103785,0.086887,0.059259,0.065588,0.096154
2023-01-13,0.071993,0.082814,0.092589,0.103785,0.086887,0.059259,0.065588
2023-02-23,0.073607,0.071993,0.082814,0.092589,0.103785,0.086887,0.059259
2023-04-11,0.082843,0.073607,0.071993,0.082814,0.092589,0.103785,0.086887


In [6]:
# Combine the lagged dissimilarity columns (the contemporaneous dissim_t is dropped)
# with the remaining variables; rows are matched on the shared date index (inner join).
_lagged_dissim = dissim.drop(columns="dissim_t")
_other_vars = data[["ko-us_gap_t1", "base_rate_diff", "tr1", "tr2", "governor", "president"]]
df = _lagged_dissim.join(_other_vars, how="inner")
df

Unnamed: 0_level_0,dissim_t1,dissim_t2,dissim_t3,dissim_t4,dissim_t5,dissim_t6,ko-us_gap_t1,base_rate_diff,tr1,tr2,governor,president
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1999-01-07,,,,,,,,,,,전철환,김대중
1999-02-04,,,,,,,,,,,전철환,김대중
1999-03-04,0.272060,,,,,,,,,,전철환,김대중
1999-04-08,0.266697,0.272060,,,,,,,,,전철환,김대중
1999-05-06,0.188787,0.266697,0.272060,,,,,,,,전철환,김대중
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-24,0.092589,0.103785,0.086887,0.059259,0.065588,0.096154,-0.125,1.0,7.225832,4.914102,이창용,윤석열
2023-01-13,0.082814,0.092589,0.103785,0.086887,0.059259,0.065588,-0.625,1.0,6.250150,1.879616,이창용,윤석열
2023-02-23,0.071993,0.082814,0.092589,0.103785,0.086887,0.059259,-0.875,0.0,6.250150,1.879616,이창용,윤석열
2023-04-11,0.073607,0.071993,0.082814,0.092589,0.103785,0.086887,-1.125,0.0,6.250150,1.879616,이창용,윤석열


In [7]:
# dissim = pd.read_csv("data_비유사도_변수_통합.csv", index_col=0)
# dissim

In [8]:
# df = pd.merge(dissim, data[["governor", "president"]], left_index=True, right_index=True)
# df

In [9]:
# Keep only the rows where base_rate_diff is present.
# dropna(subset=...) filters row-by-row; selecting back through index labels
# (df.loc[labels]) would multiply rows if a date label ever repeated.
df = df.dropna(subset=["base_rate_diff"]).copy()
# base_rate_diff is stored as float in the CSV (0.0 / 1.0 in the rows shown); use an integer label.
df["base_rate_diff"] = df["base_rate_diff"].astype(int)
df

Unnamed: 0_level_0,dissim_t1,dissim_t2,dissim_t3,dissim_t4,dissim_t5,dissim_t6,ko-us_gap_t1,base_rate_diff,tr1,tr2,governor,president
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1999-06-03,0.240359,0.188787,0.266697,0.272060,,,0.000,0,,,전철환,김대중
1999-07-07,0.225633,0.240359,0.188787,0.266697,0.272060,,0.000,0,,,전철환,김대중
1999-08-05,0.167357,0.225633,0.240359,0.188787,0.266697,0.272060,-0.250,0,,,전철환,김대중
1999-09-02,0.483118,0.167357,0.225633,0.240359,0.188787,0.266697,-0.250,0,,,전철환,김대중
1999-10-07,0.309743,0.483118,0.167357,0.225633,0.240359,0.188787,-0.500,0,,,전철환,김대중
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-24,0.092589,0.103785,0.086887,0.059259,0.065588,0.096154,-0.125,1,7.225832,4.914102,이창용,윤석열
2023-01-13,0.082814,0.092589,0.103785,0.086887,0.059259,0.065588,-0.625,1,6.250150,1.879616,이창용,윤석열
2023-02-23,0.071993,0.082814,0.092589,0.103785,0.086887,0.059259,-0.875,0,6.250150,1.879616,이창용,윤석열
2023-04-11,0.073607,0.071993,0.082814,0.092589,0.103785,0.086887,-1.125,0,6.250150,1.879616,이창용,윤석열


In [10]:
# # t+1 시점의 결정문 변화 방향으로 설정함 (즉 t기의 데이터를 가지고 t+1기의 변동을 맞추는 식) / t-1, t로 바꿔도 같은 이야기임
# df[["base_rate_diff"]].shift(-1)
# > 여기서 적용하지 않고, 뒷부분 분석 돌릴 때 y를 만들 때 적용함

In [11]:
# Exclude the term of governor 전철환 (change introduced in the 08.19 update).
_keep_rows = df["governor"].ne("전철환")
df = df[_keep_rows]
df.info()
df

<class 'pandas.core.frame.DataFrame'>
Index: 229 entries, 2002-04-04 to 2023-05-25
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   dissim_t1       229 non-null    float64
 1   dissim_t2       229 non-null    float64
 2   dissim_t3       229 non-null    float64
 3   dissim_t4       229 non-null    float64
 4   dissim_t5       229 non-null    float64
 5   dissim_t6       229 non-null    float64
 6   ko-us_gap_t1    229 non-null    float64
 7   base_rate_diff  229 non-null    int64  
 8   tr1             229 non-null    float64
 9   tr2             223 non-null    float64
 10  governor        229 non-null    object 
 11  president       229 non-null    object 
dtypes: float64(9), int64(1), object(2)
memory usage: 23.3+ KB


Unnamed: 0_level_0,dissim_t1,dissim_t2,dissim_t3,dissim_t4,dissim_t5,dissim_t6,ko-us_gap_t1,base_rate_diff,tr1,tr2,governor,president
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2002-04-04,0.246331,0.154022,0.201537,0.229776,0.402434,0.343264,2.250,0,4.319381,,박승,김대중
2002-05-07,0.315568,0.246331,0.154022,0.201537,0.229776,0.402434,2.250,1,4.319381,,박승,김대중
2002-06-05,0.386937,0.315568,0.246331,0.154022,0.201537,0.229776,2.500,0,4.319381,,박승,김대중
2002-07-04,0.445624,0.386937,0.315568,0.246331,0.154022,0.201537,2.500,0,4.345861,,박승,김대중
2002-08-06,0.320915,0.445624,0.386937,0.315568,0.246331,0.154022,2.500,0,4.345861,,박승,김대중
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-24,0.092589,0.103785,0.086887,0.059259,0.065588,0.096154,-0.125,1,7.225832,4.914102,이창용,윤석열
2023-01-13,0.082814,0.092589,0.103785,0.086887,0.059259,0.065588,-0.625,1,6.250150,1.879616,이창용,윤석열
2023-02-23,0.071993,0.082814,0.092589,0.103785,0.086887,0.059259,-0.875,0,6.250150,1.879616,이창용,윤석열
2023-04-11,0.073607,0.071993,0.082814,0.092589,0.103785,0.086887,-1.125,0,6.250150,1.879616,이창용,윤석열


### 적합용 함수

- tree
- rforest
- 신경망

In [12]:
# List the columns currently available in df before choosing model features.
df.columns

Index(['dissim_t1', 'dissim_t2', 'dissim_t3', 'dissim_t4', 'dissim_t5',
       'dissim_t6', 'ko-us_gap_t1', 'base_rate_diff', 'tr1', 'tr2', 'governor',
       'president'],
      dtype='object')

In [13]:
# DATA
# TODO: restructure so that all models are run over a single (X, y) dataset
#       (that would also make it possible to loop over tr1 / tr2).
cols = [
    'dissim_t1', 'dissim_t2', 'dissim_t3', 'dissim_t4', 'dissim_t5', 'dissim_t6',
    'base_rate_diff', 'ko-us_gap_t1',
    'tr1',  # swap for 'tr2' to try the other tr column
]

# The target is the NEXT row's base_rate_diff: shift it up by one row, then drop
# every row with a missing value (this also removes the last row, which has no successor).
temp = df[cols].assign(base_rate_diff=df["base_rate_diff"].shift(-1)).dropna()
temp = temp.astype({"base_rate_diff": int})

y = temp["base_rate_diff"]
X = temp.drop(columns="base_rate_diff")

len(X), len(y), len(df)

(228, 228, 229)

In [14]:
# Bootstrap smoke test: draw 50 index labels from X with replacement
# (fixed seed) and pull the matching feature rows and labels.
bootstrap_idx = resample(X.index, n_samples=50, replace=True, random_state=0)
bt_X, bt_y = X.loc[bootstrap_idx], y.loc[bootstrap_idx]
len(bt_X), len(bt_y), bootstrap_idx

(50,
 50,
 Index(['2016-09-09', '2006-03-09', '2012-02-09', '2019-01-24', '2007-11-08',
        '2019-05-31', '2010-11-16', '2003-01-09', '2021-04-15', '2004-01-08',
        '2005-04-07', '2009-07-09', '2008-02-13', '2021-11-25', '2009-08-11',
        '2014-01-09', '2007-02-08', '2019-02-28', '2005-07-07', '2009-07-09',
        '2016-11-11', '2009-08-11', '2009-01-10', '2016-02-16', '2004-05-06',
        '2008-09-11', '2008-04-10', '2003-01-09', '2014-09-12', '2011-12-08',
        '2020-11-26', '2019-08-30', '2008-11-07', '2016-12-15', '2019-01-24',
        '2009-02-12', '2010-07-09', '2021-11-25', '2017-02-23', '2004-09-09',
        '2014-08-14', '2014-08-14', '2014-03-13', '2016-04-19', '2004-12-09',
        '2019-02-28', '2003-01-09', '2018-02-27', '2012-12-13', '2004-12-09'],
       dtype='object', name='date'))

#### Tree

In [15]:
### Decision-tree fitting function ###

def fit_dt(X, y, bt_X, bt_y, title="TEST", save_dir_plot=None, model_args=None):
    """Fit a decision tree on (X, y) and return scores, predictions and the model.

    The tree is trained on ``X``/``y`` and evaluated twice: on the same ``X``
    (so ACC / ROC_AUC are in-sample scores) and on ``bt_X``/``bt_y``.

    Args:
        X (DataFrame): training features; ``X.columns`` supplies the feature labels.
        y (Series): training labels.
        bt_X (DataFrame): second evaluation set. The callers in this notebook
            pass a bootstrap resample of ``X``.
        bt_y (Series): labels matching ``bt_X``.
        title (str): file-name prefix for saved figures; suffixes such as
            ``_ROC_Curve`` are appended to it.
        save_dir_plot (str | None): directory the PNG files are written to.
            When falsy, figures are built and closed without being saved.
            The directory is not created here.
        model_args (dict | None): hyper-parameter overrides. Recognised key:
            ``max_depth`` (default 4). Missing keys fall back to the default.

    Returns:
        dict with the following layout:
            result:
                ACC, ROC_AUC: accuracy / ROC AUC on (X, y)
                BT_ACC, BT_ROC_AUC: the same scores on (bt_X, bt_y)
            raw:
                model: the fitted DecisionTreeClassifier
                feature_importance: DataFrame indexed by feature name,
                    sorted from most to least important
            predict: Series of predicted classes for X (index = X.index)
            predict_proba: Series of ``predict_proba(X)[:, 1]`` (index = X.index)
            BT_predict, BT_predict_proba: the same for bt_X (index = bt_X.index)
    """
    # Merge caller overrides onto the defaults: no mutable default argument is
    # shared between calls, and a partial dict no longer raises KeyError.
    params = {"max_depth": 4, **(model_args or {})}

    # Model
    model = DecisionTreeClassifier(criterion='gini', max_depth=params["max_depth"], random_state=0)
    model.fit(X, y)

    # Predictions (column 1 of predict_proba = second class in model.classes_)
    pred_y = model.predict(X)
    pred_proba_y = model.predict_proba(X)[:, 1]

    bt_pred_y = model.predict(bt_X)
    bt_pred_proba_y = model.predict_proba(bt_X)[:, 1]

    # Scores
    acc = accuracy_score(y, pred_y)
    roc_auc = roc_auc_score(y, pred_proba_y)

    bt_acc = accuracy_score(bt_y, bt_pred_y)
    bt_roc_auc = roc_auc_score(bt_y, bt_pred_proba_y)

    # Plot - ROC curves, first for the training data and then for the bootstrap sample
    roc_inputs = (
        ("ROC_Curve", y, pred_proba_y),
        ("ROC_Curve_bootstrap", bt_y, bt_pred_proba_y),
    )
    for suffix, y_true, y_score in roc_inputs:
        fper, tper, _ = roc_curve(y_true, y_score)
        plt.title("ROC Curve")
        plt.plot(fper, tper, color='red', label='ROC')
        plt.xlabel('1-TNR')
        plt.ylabel('Recall')
        if save_dir_plot:
            plt.savefig(f"{save_dir_plot}/{title}_{suffix}.png", dpi=300)
        plt.close()  # reset the figure (also keeps it from being displayed)

    # Plot - feature importances (ascending order so the largest bar ends up on top)
    fdf = (
        pd.DataFrame({"feature importance": model.feature_importances_, "feature": X.columns})
        .set_index("feature")
        .sort_values("feature importance", ascending=True)
    )

    fdf.plot.barh(align="center")
    plt.xlabel('feature importance')
    plt.ylabel('feature')
    plt.grid(alpha=0.3)

    if save_dir_plot:
        plt.savefig(f"{save_dir_plot}/{title}_feature_importance.png", dpi=300)
    plt.close()  # reset the figure (also keeps it from being displayed)

    # Plot - the fitted tree. Passing the column names labels each split with
    # the real feature name instead of the positional x[i] placeholder.
    _ = tree.plot_tree(model,
                       feature_names=list(X.columns),
                       class_names=["keep", "change"],
                       filled=True,
                       fontsize=8
                       )
    if save_dir_plot:
        plt.savefig(f"{save_dir_plot}/{title}_dt_tree_graph.png", dpi=300)
    plt.close()  # reset the figure (also keeps it from being displayed)

    # Collect the outputs
    result = {"result": {}, "raw": {}}
    result["result"]["ACC"] = acc
    result["result"]["ROC_AUC"] = roc_auc
    result["result"]["BT_ACC"] = bt_acc
    result["result"]["BT_ROC_AUC"] = bt_roc_auc
    result["raw"]["feature_importance"] = fdf.sort_values("feature importance", ascending=False)
    result["raw"]["model"] = model
    result["predict"] = pd.Series(pred_y, index=X.index)
    result["predict_proba"] = pd.Series(pred_proba_y, index=X.index)
    result["BT_predict"] = pd.Series(bt_pred_y, index=bt_X.index)
    result["BT_predict_proba"] = pd.Series(bt_pred_proba_y, index=bt_X.index)

    return result
   

In [16]:
# Smoke-test fit_dt on the exploratory X, y and the 50-row bootstrap sample.
# Figures are written under ./temp (assumes that directory already exists; this cell does not create it).
result = fit_dt(X, y, bt_X, bt_y, title="TEST_TITLE", save_dir_plot="./temp", model_args={"max_depth": 4})
print(result["result"])

{'ACC': 0.8552631578947368, 'ROC_AUC': 0.7275772892911719, 'BT_ACC': 0.88, 'BT_ROC_AUC': 0.7767857142857142}


In [17]:
# Feature importances of the tree fitted above, sorted from most to least important.
result["raw"]["feature_importance"]

Unnamed: 0_level_0,feature importance
feature,Unnamed: 1_level_1
tr1,0.430915
dissim_t3,0.268076
ko-us_gap_t1,0.134527
dissim_t4,0.087697
dissim_t1,0.078786
dissim_t2,0.0
dissim_t5,0.0
dissim_t6,0.0


In [18]:
# Predicted class for every row of X (same index as X) from the tree fitted above.
result["predict"]

date
2002-04-04    0
2002-05-07    0
2002-06-05    0
2002-07-04    0
2002-08-06    0
             ..
2022-10-12    1
2022-11-24    1
2023-01-13    0
2023-02-23    0
2023-04-11    0
Length: 228, dtype: int64

In [19]:
# predict_proba(X)[:, 1] for every row of X from the tree fitted above.
result["predict_proba"]

date
2002-04-04    0.102564
2002-05-07    0.102564
2002-06-05    0.102564
2002-07-04    0.102564
2002-08-06    0.102564
                ...   
2022-10-12    1.000000
2022-11-24    1.000000
2023-01-13    0.102564
2023-02-23    0.102564
2023-04-11    0.102564
Length: 228, dtype: float64

In [20]:
# The fitted DecisionTreeClassifier object itself.
result["raw"]["model"]

#### RForest

In [21]:
def fit_rforest(X, y, bt_X, bt_y, title="TEST", save_dir_plot=None, model_args=None):
    """Fit a random forest on (X, y) and return scores, predictions and the model.

    The forest is trained on ``X``/``y`` and evaluated twice: on the same ``X``
    (so ACC / ROC_AUC are in-sample scores) and on ``bt_X``/``bt_y``.

    Args:
        X (DataFrame): training features; ``X.columns`` supplies the feature labels.
        y (Series): training labels.
        bt_X (DataFrame): second evaluation set. The callers in this notebook
            pass a bootstrap resample of ``X``.
        bt_y (Series): labels matching ``bt_X``.
        title (str): file-name prefix for saved figures; suffixes such as
            ``_ROC_Curve`` are appended to it.
        save_dir_plot (str | None): directory the PNG files are written to.
            When falsy, figures are built and closed without being saved.
            The directory is not created here.
        model_args (dict | None): hyper-parameter overrides. Recognised keys:
            ``n_estimators`` (default 50) and ``max_depth`` (default 4).
            Missing keys fall back to those defaults.

    Returns:
        dict with the following layout:
            result:
                ACC, ROC_AUC: accuracy / ROC AUC on (X, y)
                BT_ACC, BT_ROC_AUC: the same scores on (bt_X, bt_y)
            raw:
                model: the fitted RandomForestClassifier
                feature_importance: DataFrame indexed by feature name,
                    sorted from most to least important
            predict: Series of predicted classes for X (index = X.index)
            predict_proba: Series of ``predict_proba(X)[:, 1]`` (index = X.index)
            BT_predict, BT_predict_proba: the same for bt_X (index = bt_X.index)
    """
    # Merge caller overrides onto the defaults: no mutable default argument is
    # shared between calls, and a partial dict no longer raises KeyError.
    params = {"n_estimators": 50, "max_depth": 4, **(model_args or {})}

    # Model
    model = RandomForestClassifier(n_estimators=params["n_estimators"],
                                   max_depth=params["max_depth"],
                                   random_state=0)
    model.fit(X, y)

    # Predictions (column 1 of predict_proba = second class in model.classes_)
    pred_y = model.predict(X)
    pred_proba_y = model.predict_proba(X)[:, 1]

    bt_pred_y = model.predict(bt_X)
    bt_pred_proba_y = model.predict_proba(bt_X)[:, 1]

    # Scores
    acc = accuracy_score(y, pred_y)
    roc_auc = roc_auc_score(y, pred_proba_y)

    bt_acc = accuracy_score(bt_y, bt_pred_y)
    bt_roc_auc = roc_auc_score(bt_y, bt_pred_proba_y)

    # Plot - ROC curves, first for the training data and then for the bootstrap sample
    roc_inputs = (
        ("ROC_Curve", y, pred_proba_y),
        ("ROC_Curve_bootstrap", bt_y, bt_pred_proba_y),
    )
    for suffix, y_true, y_score in roc_inputs:
        fper, tper, _ = roc_curve(y_true, y_score)
        plt.title("ROC Curve")
        plt.plot(fper, tper, color='red', label='ROC')
        plt.xlabel('1-TNR')
        plt.ylabel('Recall')
        if save_dir_plot:
            plt.savefig(f"{save_dir_plot}/{title}_{suffix}.png", dpi=300)
        plt.close()  # reset the figure (also keeps it from being displayed)

    # Plot - feature importances (ascending order so the largest bar ends up on top)
    fdf = (
        pd.DataFrame({"feature importance": model.feature_importances_, "feature": X.columns})
        .set_index("feature")
        .sort_values("feature importance", ascending=True)
    )

    fdf.plot.barh(align="center")
    plt.xlabel('feature importance')
    plt.ylabel('feature')
    plt.grid(alpha=0.3)

    if save_dir_plot:
        plt.savefig(f"{save_dir_plot}/{title}_feature_importance.png", dpi=300)
    plt.close()  # reset the figure (also keeps it from being displayed)

    # Collect the outputs
    result = {"result": {}, "raw": {}}
    result["result"]["ACC"] = acc
    result["result"]["ROC_AUC"] = roc_auc
    result["result"]["BT_ACC"] = bt_acc
    result["result"]["BT_ROC_AUC"] = bt_roc_auc
    result["raw"]["feature_importance"] = fdf.sort_values("feature importance", ascending=False)
    result["raw"]["model"] = model
    result["predict"] = pd.Series(pred_y, index=X.index)
    result["predict_proba"] = pd.Series(pred_proba_y, index=X.index)
    result["BT_predict"] = pd.Series(bt_pred_y, index=bt_X.index)
    result["BT_predict_proba"] = pd.Series(bt_pred_proba_y, index=bt_X.index)

    return result
   

In [22]:
# Smoke-test fit_rforest on the exploratory X, y and the 50-row bootstrap sample.
# Figures are written under ./temp (assumes that directory already exists; this cell does not create it).
result = fit_rforest(X, y, bt_X, bt_y, title="TEST_TITLE_rf", save_dir_plot="./temp", model_args={"n_estimators": 50, "max_depth": 4})
print(result["result"])

{'ACC': 0.8289473684210527, 'ROC_AUC': 0.9652051251910192, 'BT_ACC': 0.86, 'BT_ROC_AUC': 0.994047619047619}


In [23]:
# Feature importances of the random forest fitted above, sorted from most to least important.
result["raw"]["feature_importance"]

Unnamed: 0_level_0,feature importance
feature,Unnamed: 1_level_1
tr1,0.32604
dissim_t3,0.117562
dissim_t4,0.110695
dissim_t1,0.101726
ko-us_gap_t1,0.100285
dissim_t6,0.084096
dissim_t2,0.083323
dissim_t5,0.076273


In [24]:
# Predicted class for every row of X (same index as X) from the random forest fitted above.
result["predict"]

date
2002-04-04    0
2002-05-07    0
2002-06-05    0
2002-07-04    0
2002-08-06    0
             ..
2022-10-12    1
2022-11-24    1
2023-01-13    0
2023-02-23    0
2023-04-11    0
Length: 228, dtype: int64

In [25]:
# predict_proba(X)[:, 1] for every row of X from the random forest fitted above.
result["predict_proba"]

date
2002-04-04    0.243985
2002-05-07    0.124315
2002-06-05    0.111988
2002-07-04    0.186907
2002-08-06    0.099090
                ...   
2022-10-12    0.769905
2022-11-24    0.679278
2023-01-13    0.173286
2023-02-23    0.194553
2023-04-11    0.213318
Length: 228, dtype: float64

#### MLP

In [26]:
def fit_mlp(X, y, bt_X, bt_y, title="TEST", save_dir_plot=None, model_args=None):
    """Fit an MLP classifier on (X, y) and return scores, predictions and the model.

    The network is trained on ``X``/``y`` and evaluated twice: on the same
    ``X`` (so ACC / ROC_AUC are in-sample scores) and on ``bt_X``/``bt_y``.
    The features are passed to the network as given; no scaling is applied here.

    Args:
        X (DataFrame): training features.
        y (Series): training labels.
        bt_X (DataFrame): second evaluation set. The callers in this notebook
            pass a bootstrap resample of ``X``.
        bt_y (Series): labels matching ``bt_X``.
        title (str): file-name prefix for saved figures; suffixes such as
            ``_ROC_Curve`` are appended to it.
        save_dir_plot (str | None): directory the PNG files are written to.
            When falsy, figures are built and closed without being saved.
            The directory is not created here.
        model_args (dict | None): hyper-parameter overrides. Recognised keys:
            ``hidden_layer_size`` (default ``(10, 12, 8)``) and ``max_iter``
            (default 1000). Missing keys fall back to those defaults.

    Returns:
        dict with the following layout:
            result:
                ACC, ROC_AUC: accuracy / ROC AUC on (X, y)
                BT_ACC, BT_ROC_AUC: the same scores on (bt_X, bt_y)
            raw:
                model: the fitted MLPClassifier
            predict: Series of predicted classes for X (index = X.index)
            predict_proba: Series of ``predict_proba(X)[:, 1]`` (index = X.index)
            BT_predict, BT_predict_proba: the same for bt_X (index = bt_X.index)
    """
    # Merge caller overrides onto the defaults: no mutable default argument is
    # shared between calls, and a partial dict no longer raises KeyError.
    params = {"hidden_layer_size": (10, 12, 8), "max_iter": 1000, **(model_args or {})}

    # Model
    model = MLPClassifier(solver='adam', alpha=1e-5, random_state=0,
                          hidden_layer_sizes=params["hidden_layer_size"],
                          max_iter=params["max_iter"])
    model.fit(X, y)

    # Predictions (column 1 of predict_proba = second class in model.classes_)
    pred_y = model.predict(X)
    pred_proba_y = model.predict_proba(X)[:, 1]

    bt_pred_y = model.predict(bt_X)
    bt_pred_proba_y = model.predict_proba(bt_X)[:, 1]

    # Scores
    acc = accuracy_score(y, pred_y)
    roc_auc = roc_auc_score(y, pred_proba_y)

    bt_acc = accuracy_score(bt_y, bt_pred_y)
    bt_roc_auc = roc_auc_score(bt_y, bt_pred_proba_y)

    # Plot - ROC curves, first for the training data and then for the bootstrap sample
    roc_inputs = (
        ("ROC_Curve", y, pred_proba_y),
        ("ROC_Curve_bootstrap", bt_y, bt_pred_proba_y),
    )
    for suffix, y_true, y_score in roc_inputs:
        fper, tper, _ = roc_curve(y_true, y_score)
        plt.title("ROC Curve")
        plt.plot(fper, tper, color='red', label='ROC')
        plt.xlabel('1-TNR')
        plt.ylabel('Recall')
        if save_dir_plot:
            plt.savefig(f"{save_dir_plot}/{title}_{suffix}.png", dpi=300)
        plt.close()  # reset the figure (also keeps it from being displayed)

    # Collect the outputs
    result = {"result": {}, "raw": {}}
    result["result"]["ACC"] = acc
    result["result"]["ROC_AUC"] = roc_auc
    result["result"]["BT_ACC"] = bt_acc
    result["result"]["BT_ROC_AUC"] = bt_roc_auc
    result["raw"]["model"] = model
    result["predict"] = pd.Series(pred_y, index=X.index)
    result["predict_proba"] = pd.Series(pred_proba_y, index=X.index)
    result["BT_predict"] = pd.Series(bt_pred_y, index=bt_X.index)
    result["BT_predict_proba"] = pd.Series(bt_pred_proba_y, index=bt_X.index)

    return result


In [27]:
# Smoke-test fit_mlp on the exploratory X, y and the 50-row bootstrap sample.
# Figures are written under ./temp (assumes that directory already exists; this cell does not create it).
result = fit_mlp(X, y, bt_X, bt_y, title="TEST_TITLE_mlp", save_dir_plot="./temp", model_args={"hidden_layer_size": (10, 12, 8), "max_iter": 1000})
print(result["result"])

{'ACC': 0.8333333333333334, 'ROC_AUC': 0.6861408252027742, 'BT_ACC': 0.88, 'BT_ROC_AUC': 0.7351190476190476}


In [28]:
# Predicted class for every row of X (same index as X) from the MLP fitted above.
result["predict"]

date
2002-04-04    0
2002-05-07    0
2002-06-05    0
2002-07-04    0
2002-08-06    0
             ..
2022-10-12    1
2022-11-24    1
2023-01-13    0
2023-02-23    0
2023-04-11    0
Length: 228, dtype: int64

In [29]:
# predict_proba(X)[:, 1] for every row of X from the MLP fitted above.
result["predict_proba"]

date
2002-04-04    0.106229
2002-05-07    0.111386
2002-06-05    0.120212
2002-07-04    0.102532
2002-08-06    0.090055
                ...   
2022-10-12    0.559075
2022-11-24    0.556123
2023-01-13    0.416268
2023-02-23    0.380381
2023-04-11    0.348999
Length: 228, dtype: float64

### 데이터별로 모델 적합 후 저장

- 전체
- 총재별
- 정권별

In [30]:
# # 전체 데이터 적합 >> for문 안에 전부 구현한 버전 > 아래에 함수로 심플하게 래핑한거 있음, 이건 참고용으로 두고 실사용은 그것으로
# for tr in tqdm(["tr1", "tr2"]):
#     # TITLE
#     title_base = f"ALL_{tr.upper()}"  # 전체 데이터
#     save_dir = "./temp"
#     save_dir_plot = "./temp/plot"
    
#     if not os.path.isdir(save_dir):
#         os.mkdir(save_dir)
    
#     if not os.path.isdir(save_dir_plot):
#         os.mkdir(save_dir_plot)
    
#     # DATA
#     cols = ['dissim_t1', 'dissim_t2', 'dissim_t3', 'dissim_t4', 'dissim_t5', 'dissim_t6', 'base_rate_diff', 'FFR', 'ko-us_gap', tr]
#     temp = df[cols].copy()

#     temp["base_rate_diff"] = temp[["base_rate_diff"]].shift(-1)  # 다음 기의 값 예측
#     temp = temp.dropna()
#     temp["base_rate_diff"] = temp["base_rate_diff"].astype(int)

#     X = temp.drop(["base_rate_diff"], axis=1)
#     y = temp["base_rate_diff"]
    
#     # FIT
#     out_dt = fit_dt(X, y, title=f"{title_base}_dt", save_dir_plot=save_dir_plot, 
#                     model_args={"max_depth": 4})
    
#     out_rf = fit_rforest(X, y, title=f"{title_base}_rf", save_dir_plot=save_dir_plot, 
#                     model_args={"n_estimators": 50, "max_depth": 4})
    
#     out_mlp = fit_mlp(X, y, title=f"{title_base}_mlp", save_dir_plot=save_dir_plot, 
#                     model_args={"hidden_layer_size": (10, 12, 8), "max_iter": 1000})
    
#     # SAVE - predict
#     pdf = pd.DataFrame()
#     pdf["Answer"] = y
#     pdf["dt_proba"] = out_dt["predict_proba"]
#     pdf["dt_predict"] = out_dt["predict"]
#     pdf["rf_proba"] = out_rf["predict_proba"]
#     pdf["rf_predict"] = out_rf["predict"]
#     pdf["mlp_proba"] = out_mlp["predict_proba"]
#     pdf["mlp_predict"] = out_mlp["predict"]
#     pdf.to_excel(f"{save_dir}/{title_base}_predicted.xlsx")
#     pdf.to_csv(f"{save_dir}/{title_base}_predicted.csv")
    
#     # SAVE - result
#     results = []
#     results.append(out_dt["result"])
#     results.append(out_rf["result"])
#     results.append(out_mlp["result"])
#     res_df = pd.DataFrame(results)
#     res_df.index = ["Tree", "RForest", "MLP"]
#     res_df.index.name = "Model"
#     res_df.to_excel(f"{save_dir}/{title_base}_score_result.xlsx")
#     res_df.to_csv(f"{save_dir}/{title_base}_score_result.csv")
    
    
#     # SAVE - others (feature_importance)
#     dt_fimp = out_dt["raw"]["feature_importance"]
#     dt_fimp.to_excel(f"{save_dir}/{title_base}_feature_importance_dt.xlsx")
#     dt_fimp.to_csv(f"{save_dir}/{title_base}_feature_importance_dt.csv")
    
#     rf_fimp = out_rf["raw"]["feature_importance"]
#     rf_fimp.to_excel(f"{save_dir}/{title_base}_feature_importance_rforest.xlsx")
#     rf_fimp.to_csv(f"{save_dir}/{title_base}_feature_importance_rforest.csv")
    
#     # break

In [31]:
### Runner function ###
def run_fit_save(X, y, title_base, save_dir, save_plot_dir):
    """Fit the tree, random-forest and MLP models on (X, y) and save every output.

    A 50-row bootstrap sample (drawn with replacement, fixed seed) is taken
    from (X, y) and handed to each fit function as its second evaluation set.

    Args:
        X (DataFrame): feature matrix.
        y (Series): labels, indexed like ``X``.
        title_base (str): prefix for every file written.
        save_dir (str): directory for the xlsx/csv outputs. Must already exist.
        save_plot_dir (str): directory for the figures. Must already exist.

    Returns:
        tuple: ``(pdf, res_df, out_dt, out_rf, out_mlp)`` where ``pdf`` holds
        the labels plus each model's prediction and probability on X,
        ``res_df`` holds one row of scores per model, and ``out_*`` are the
        raw result dicts from fit_dt / fit_rforest / fit_mlp.

    Files written to ``save_dir`` (each as .xlsx and .csv):
        {title_base}_predicted, {title_base}_predicted_bootstrap,
        {title_base}_score_result, {title_base}_feature_importance_dt,
        {title_base}_feature_importance_rforest
    """
    # Bootstrap data (the "bt" prefix marks everything built from the resample)
    bootstrap_idx = resample(X.index, replace=True, n_samples=50, random_state=0)
    bt_X = X.loc[bootstrap_idx, :]
    bt_y = y.loc[bootstrap_idx]

    # FIT - hyper-parameters are tuned here.
    # The figure directory is the function's own parameter; the previous code
    # read a same-purpose global name and only worked when the caller defined it.
    out_dt = fit_dt(X, y, bt_X, bt_y, title=f"{title_base}_dt", save_dir_plot=save_plot_dir,
                    model_args={"max_depth": 3})

    out_rf = fit_rforest(X, y, bt_X, bt_y, title=f"{title_base}_rf", save_dir_plot=save_plot_dir,
                         model_args={"n_estimators": 20, "max_depth": 3})

    out_mlp = fit_mlp(X, y, bt_X, bt_y, title=f"{title_base}_mlp", save_dir_plot=save_plot_dir,
                      model_args={"hidden_layer_size": (9, 10, 8), "max_iter": 10000})

    fitted = (("dt", out_dt), ("rf", out_rf), ("mlp", out_mlp))

    # SAVE - predictions on X
    pdf = pd.DataFrame()
    pdf["Answer"] = y
    for tag, out in fitted:
        pdf[f"{tag}_proba"] = out["predict_proba"]
        pdf[f"{tag}_predict"] = out["predict"]

    pdf.to_excel(f"{save_dir}/{title_base}_predicted.xlsx")
    pdf.to_csv(f"{save_dir}/{title_base}_predicted.csv")

    # SAVE - predictions on the bootstrap sample.
    # All of these series share bt_y's index (repeated labels included), so
    # the column assignments line up row by row.
    bt_pdf = pd.DataFrame()
    bt_pdf["Answer"] = bt_y
    for tag, out in fitted:
        bt_pdf[f"{tag}_BT_proba"] = out["BT_predict_proba"]
        bt_pdf[f"{tag}_BT_predict"] = out["BT_predict"]

    # Write the bootstrap frame; the previous code saved `pdf` here, so the
    # "_bootstrap" files were just copies of the full-sample predictions.
    bt_pdf.to_excel(f"{save_dir}/{title_base}_predicted_bootstrap.xlsx")
    bt_pdf.to_csv(f"{save_dir}/{title_base}_predicted_bootstrap.csv")

    # SAVE - score table, one row per model
    res_df = pd.DataFrame([out["result"] for _, out in fitted])
    res_df.index = ["Tree", "RForest", "MLP"]
    res_df.index.name = "Model"
    res_df.to_excel(f"{save_dir}/{title_base}_score_result.xlsx")
    res_df.to_csv(f"{save_dir}/{title_base}_score_result.csv")

    # SAVE - feature importances (the MLP result has none)
    dt_fimp = out_dt["raw"]["feature_importance"]
    dt_fimp.to_excel(f"{save_dir}/{title_base}_feature_importance_dt.xlsx")
    dt_fimp.to_csv(f"{save_dir}/{title_base}_feature_importance_dt.csv")

    rf_fimp = out_rf["raw"]["feature_importance"]
    rf_fimp.to_excel(f"{save_dir}/{title_base}_feature_importance_rforest.xlsx")
    rf_fimp.to_csv(f"{save_dir}/{title_base}_feature_importance_rforest.csv")

    return pdf, res_df, out_dt, out_rf, out_mlp

##### RUN

In [32]:
### Full sample
for tr in tqdm(["tr1", "tr2"]):
    # TITLE
    title_base = f"ALL_{tr.upper()}"  # full-sample run
    save_dir = "./model_result/All"
    save_dir_plot = "./model_result/All/plot"

    # makedirs also creates the missing parent (./model_result) and does
    # nothing when the directory is already there; os.mkdir fails on a fresh checkout.
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(save_dir_plot, exist_ok=True)

    # DATA
    cols = ['dissim_t1', 'dissim_t2', 'dissim_t3', 'dissim_t4', 'dissim_t5', 'dissim_t6', 'base_rate_diff', 'ko-us_gap_t1', tr]
    temp = df[cols].copy()
    temp = temp.dropna()

    # NOTE(review): unlike the per-governor loop and the earlier exploratory
    # cell, the target is NOT shifted by -1 here, so these models are fit on
    # the same-row base_rate_diff. Confirm which target is intended. If the
    # lines below are re-enabled, the shift has to run before dropna().
    # temp["base_rate_diff"] = temp[["base_rate_diff"]].shift(-1)  # predict the next period's value
    # temp["base_rate_diff"] = temp["base_rate_diff"].astype(int)

    # Save the modelling data
    temp.to_excel(f"{save_dir}/DATA_{title_base}.xlsx")
    temp.to_csv(f"{save_dir}/DATA_{title_base}.csv")

    # X, y
    X = temp.drop(["base_rate_diff"], axis=1)
    y = temp["base_rate_diff"]

    # RUN
    pdf, res_df, out_dt, out_rf, out_mlp = run_fit_save(X, y, title_base, save_dir, save_dir_plot)

100%|██████████| 2/2 [00:05<00:00,  2.64s/it]


In [33]:
### Per governor
governors = df["governor"].unique()

for gov in governors:
    for tr in ["tr1", "tr2"]:
        # TITLE
        title_base = f"{gov}_{tr.upper()}"  # one governor's data
        save_dir = f"./model_result/{gov}"
        save_dir_plot = f"./model_result/{gov}/plot"

        # makedirs also creates the missing parent (./model_result) and does
        # nothing when the directory is already there; os.mkdir fails on a fresh checkout.
        os.makedirs(save_dir, exist_ok=True)
        os.makedirs(save_dir_plot, exist_ok=True)

        print(title_base)

        # DATA
        cols = ['dissim_t1', 'dissim_t2', 'dissim_t3', 'dissim_t4', 'dissim_t5', 'dissim_t6', 'base_rate_diff', 'ko-us_gap_t1',
                tr, "governor"]
        temp = df[cols].copy()

        # Target = the next row's base_rate_diff. The shift runs on the whole
        # frame before the governor filter, so a governor's last row is
        # labelled with the first row of the following governor.
        temp["base_rate_diff"] = temp["base_rate_diff"].shift(-1)
        temp = temp.dropna()
        temp["base_rate_diff"] = temp["base_rate_diff"].astype(int)

        # Keep this governor's rows only, then drop the governor column.
        # (Non-inplace drop: avoids mutating a filtered slice in place.)
        temp = temp.loc[temp["governor"] == gov, :].drop("governor", axis=1)

        # Save the modelling data (written even when no rows are left)
        temp.to_excel(f"{save_dir}/DATA_{title_base}.xlsx")
        temp.to_csv(f"{save_dir}/DATA_{title_base}.csv")

        # X, y
        X = temp.drop(["base_rate_diff"], axis=1)
        y = temp["base_rate_diff"]

        # Fit only when there is data for this governor / tr combination
        if len(X) > 0:

            # RUN
            pdf, res_df, out_dt, out_rf, out_mlp = run_fit_save(X, y, title_base, save_dir, save_dir_plot)
            print("Size:", len(X))
            print("Ratio:", y.mean())
            print("DT:", out_dt["result"])
            print("RF:", out_rf["result"])
            print("MLP:", out_mlp["result"])

        else:
            print("NO DATA")
        print("-" * 50)
            

박승_TR1
Size: 48
Ratio: 0.16666666666666666
DT: {'ACC': 0.9166666666666666, 'ROC_AUC': 0.8875000000000001, 'BT_ACC': 0.94, 'BT_ROC_AUC': 0.8125}
RF: {'ACC': 0.8958333333333334, 'ROC_AUC': 0.946875, 'BT_ACC': 0.92, 'BT_ROC_AUC': 0.891304347826087}
MLP: {'ACC': 0.8333333333333334, 'ROC_AUC': 0.625, 'BT_ACC': 0.92, 'BT_ROC_AUC': 0.6793478260869565}
--------------------------------------------------
박승_TR2
Size: 42
Ratio: 0.16666666666666666
DT: {'ACC': 0.9285714285714286, 'ROC_AUC': 0.9081632653061225, 'BT_ACC': 0.88, 'BT_ROC_AUC': 0.9313929313929313}
RF: {'ACC': 0.9047619047619048, 'ROC_AUC': 1.0, 'BT_ACC': 0.86, 'BT_ROC_AUC': 1.0}
MLP: {'ACC': 1.0, 'ROC_AUC': 1.0, 'BT_ACC': 1.0, 'BT_ROC_AUC': 1.0}
--------------------------------------------------
이성태_TR1
Size: 48
Ratio: 0.20833333333333334
DT: {'ACC': 0.9583333333333334, 'ROC_AUC': 0.9513157894736842, 'BT_ACC': 0.96, 'BT_ROC_AUC': 0.9974999999999999}
RF: {'ACC': 0.8958333333333334, 'ROC_AUC': 1.0, 'BT_ACC': 0.9, 'BT_ROC_AUC': 1.0}
MLP: 

In [34]:
# Check the feature set of X as left by the last iteration of the loop above.
print(len(X.columns))
X.columns

8


Index(['dissim_t1', 'dissim_t2', 'dissim_t3', 'dissim_t4', 'dissim_t5',
       'dissim_t6', 'ko-us_gap_t1', 'tr2'],
      dtype='object')

In [35]:
# Shell command (IPython "!"): recursively pack ./model_result into model_result.zip.
!zip -r model_result.zip ./model_result

  adding: model_result/ (stored 0%)
  adding: model_result/All/ (stored 0%)
  adding: model_result/All/.ipynb_checkpoints/ (stored 0%)
  adding: model_result/All/.ipynb_checkpoints/ALL_TR1_feature_importance_dt-checkpoint.csv (deflated 37%)
  adding: model_result/All/.ipynb_checkpoints/ALL_TR1_score_result-checkpoint.csv (deflated 36%)
  adding: model_result/All/ALL_TR1_feature_importance_dt.csv (deflated 38%)
  adding: model_result/All/ALL_TR1_feature_importance_dt.xlsx (deflated 11%)
  adding: model_result/All/ALL_TR1_feature_importance_rforest.csv (deflated 40%)
  adding: model_result/All/ALL_TR1_feature_importance_rforest.xlsx (deflated 11%)
  adding: model_result/All/ALL_TR1_predicted.csv (deflated 69%)
  adding: model_result/All/ALL_TR1_predicted.xlsx (deflated 8%)
  adding: model_result/All/ALL_TR1_predicted_bootstrap.csv (deflated 69%)
  adding: model_result/All/ALL_TR1_predicted_bootstrap.xlsx (deflated 8%)
  adding: model_result/All/ALL_TR1_score_result.csv (deflated 41%)
  a