In [1]:
import pandas as pd 

In [2]:
# Load the train/test application tables and the previous-loan table (lb)
# from the local homecredit-data directory.
train, test, lb = (
    pd.read_csv(csv_path)
    for csv_path in (
        './homecredit-data/train.csv',
        './homecredit-data/test.csv',
        './homecredit-data/loan_before.csv',
    )
)

# 1. 문제정의

<strong>질문 1</strong> - 어떤 요소가 대출금 상환 여부에 큰 영향을 줄까? <br>
<strong>질문 2</strong> - 그 요소들이 상환여부에 어떤 영향을 줄까?

# 2. 방법론

## 2.1 분석과정

이 질문에 대한 해답을 얻기 위해서 해석가능한 머신러닝(xAI)을 사용하였다.

<strong>프로세스</strong>

1단계 : feature engineering을 한다. <br>
2단계 : 머신러닝 모델을 만든다. <br>
3단계 : shap value를 통해 어떤 변수가 가장 큰 영향을 주는지 알아본다. <br>
4단계 : 영향을 많이 주는 5개의 변수와 대출금 상환 여부와의 관계를 살펴본다. <br>

## 2.2 feature engineering

AMT_CREDIT를 AMT_ANNUITY로 나누어, 해당 사람이 몇 개월에 걸쳐 돈을 갚아야 하는지를 변수화한다.

In [3]:
# Add the credit-to-annuity ratio (AMT_CREDIT divided by AMT_ANNUITY) as a new
# column on both frames.
train = train.assign(
    AMT_CREDIT_TO_ANNUITY_RATIO=train['AMT_CREDIT'].div(train['AMT_ANNUITY'])
)
test = test.assign(
    AMT_CREDIT_TO_ANNUITY_RATIO=test['AMT_CREDIT'].div(test['AMT_ANNUITY'])
)

lb(이전에 대출한 정보가 기록된 데이터)를 활용해 변수 생성 시도
<br>

1. groupby 후 평균(사람 id로 묶은 후 각 변수의 평균을 낸다.) <br>
<ul>
    <li>AMT_CREDIT_SUM(이전 대출의 금액) </li>
    <li>DAYS_CREDIT(train, test로 부터 며칠 전에 이전 대출을 진행했는지) </li>
    <li>CNT_CREDIT_PROLONG(대출연장을 몇 번 했는지) </li>
</ul>

2. groupby 후 개수(사람 id별 빈도수, 즉 해당 사람이 이전에 대출을 몇 번 진행했는지)

In [4]:
# Mean AMT_CREDIT_SUM per SK_ID_CURR from the previous-loan table, computed once
# and left-joined onto both frames (ids with no rows in lb get NaN).
prev_credit_sum_mean = lb.groupby('SK_ID_CURR')['AMT_CREDIT_SUM'].mean().reset_index()
train = train.merge(prev_credit_sum_mean, on='SK_ID_CURR', how='left')
test = test.merge(prev_credit_sum_mean, on='SK_ID_CURR', how='left')

In [5]:
# Per-SK_ID_CURR means of DAYS_CREDIT and CNT_CREDIT_PROLONG from the
# previous-loan table, aggregated in one pass and left-joined onto both frames.
# The column order (DAYS_CREDIT, then CNT_CREDIT_PROLONG) matches two
# sequential merges.
prev_loan_means = (
    lb.groupby('SK_ID_CURR')[['DAYS_CREDIT', 'CNT_CREDIT_PROLONG']]
    .mean()
    .reset_index()
)
train = train.merge(prev_loan_means, on='SK_ID_CURR', how='left')
test = test.merge(prev_loan_means, on='SK_ID_CURR', how='left')

In [6]:
# Number of previous-loan rows per SK_ID_CURR, left-joined as the 'count' column.
# NOTE(review): ids with no rows in lb end up with NaN (not 0) after the left
# join — confirm that this is the intended encoding.
prev_loan_count = lb.groupby('SK_ID_CURR').size().reset_index(name='count')
train = train.merge(prev_loan_count, on='SK_ID_CURR', how='left')
test = test.merge(prev_loan_count, on='SK_ID_CURR', how='left')

해당 프로젝트의 목적은 모델 해석이기 때문에, 이에 방해를 주는 변수는 다 삭제한다.

<strong>제거한 변수목록</strong>
<ul>
    <li>CODE_GENDER : 범주형 변수 </li>
    <li>FLAG_OWN_CAR : 범주형 변수 </li>
    <li>NAME_TYPE_SUITE : 범주형 변수 </li>
    <li>NAME_INCOME_TYPE : 범주형 변수 </li>
    <li>NAME_EDUCATION_TYPE : 범주형 변수 </li>
    <li>NAME_HOUSING_TYPE : 범주형 변수 </li>
    <li>ORGANIZATION_TYPE : 범주형 변수 </li>
    <li>EXT_SOURCE_1 : 변수의 의미를 정확히 모름 </li>
    <li>EXT_SOURCE_2 : 변수의 의미를 정확히 모름 </li>
    <li>EXT_SOURCE_3 : 변수의 의미를 정확히 모름 </li>
</ul>

In [7]:
# Columns dropped from train/test before modelling because they would get in
# the way of interpreting the model.
del_list = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_HOUSING_TYPE',
    'ORGANIZATION_TYPE',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
]

In [8]:
# Remove every column named in del_list from both frames; each name is rebound
# to the reduced copy returned by drop().
train, test = (frame.drop(columns=del_list) for frame in (train, test))

In [9]:
# Display the columns remaining in train after the feature merges and the drop.
train.columns

Index(['SK_ID_CURR', 'TARGET', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS',
       'HOUR_APPR_PROCESS_START', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'AMT_CREDIT_TO_ANNUITY_RATIO',
       'AMT_CREDIT_SUM', 'DAYS_CREDIT', 'CNT_CREDIT_PROLONG', 'count'],
      dtype='object')

## 2.3 모델링

In [10]:
# Candidate model inputs: every remaining train column except SK_ID_CURR and
# TARGET, kept in the same order as train.columns.
input_var = [
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'REGION_POPULATION_RELATIVE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'DAYS_ID_PUBLISH',
    'OWN_CAR_AGE',
    'CNT_FAM_MEMBERS',
    'HOUR_APPR_PROCESS_START',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
    # Engineered features
    'AMT_CREDIT_TO_ANNUITY_RATIO',
    'AMT_CREDIT_SUM',
    'DAYS_CREDIT',
    'CNT_CREDIT_PROLONG',
    'count',
]

In [12]:
# Pairwise correlation matrix of the candidate inputs; the trailing expression
# renders it as a colour-graded table.
corr = train.loc[:, input_var].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_ID_PUBLISH,OWN_CAR_AGE,CNT_FAM_MEMBERS,HOUR_APPR_PROCESS_START,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_YEAR,AMT_CREDIT_TO_ANNUITY_RATIO,AMT_CREDIT_SUM,DAYS_CREDIT,CNT_CREDIT_PROLONG,count
FLAG_OWN_REALTY,1.0,0.008244,0.003243,-0.042446,-0.001448,0.010826,-0.11093,-0.015164,0.004217,0.019393,0.014595,-0.10558,0.026066,0.090058,-0.08392,-0.002745,0.000174,-0.00979,0.008414
CNT_CHILDREN,0.008244,1.0,0.029879,0.006465,0.023275,-0.033326,0.332123,0.068807,-0.029581,-0.010951,0.883051,-0.009661,-0.006102,-0.036431,-0.022026,0.035864,0.026285,-0.012065,0.002649
AMT_INCOME_TOTAL,0.003243,0.029879,1.0,0.366717,0.441573,0.185047,0.066875,-0.041696,0.029519,-0.126551,0.029342,0.092505,-0.040823,0.031593,0.077303,0.241929,-0.013266,0.016117,0.116635
AMT_CREDIT,-0.042446,0.006465,0.366717,1.0,0.770938,0.092177,-0.047089,-0.085049,0.000988,-0.111244,0.066847,0.047472,-0.070924,-0.037907,0.656337,0.135435,-0.068411,-0.000384,0.046902
AMT_ANNUITY,-0.001448,0.023275,0.441573,0.770938,1.0,0.127204,0.017106,-0.048381,0.013662,-0.108185,0.073912,0.047113,-0.058709,0.00027,0.111694,0.128144,-0.052613,-0.005724,0.013588
REGION_POPULATION_RELATIVE,0.010826,-0.033326,0.185047,0.092177,0.127204,1.0,-0.023276,0.01387,0.000946,-0.08827,-0.025638,0.18273,-0.051167,0.015725,0.003524,0.077984,-0.010819,0.003701,-0.034289
DAYS_BIRTH,-0.11093,0.332123,0.066875,-0.047089,0.017106,-0.023276,1.0,0.349373,0.266852,-0.015877,0.283729,0.0872,0.080317,-0.073904,-0.097736,0.053205,0.201829,0.028239,-0.063004
DAYS_EMPLOYED,-0.015164,0.068807,-0.041696,-0.085049,-0.048381,0.01387,0.349373,1.0,0.093421,0.014016,0.032736,0.003502,0.144221,0.003833,-0.087176,-0.018131,0.160521,0.000124,-0.148704
DAYS_ID_PUBLISH,0.004217,-0.029581,0.029519,0.000988,0.013662,0.000946,0.266852,0.093421,1.0,0.001141,-0.025311,0.0297,0.089952,-0.039895,-0.02028,0.033384,0.188471,0.022736,-0.100673
OWN_CAR_AGE,0.019393,-0.010951,-0.126551,-0.111244,-0.108185,-0.08827,-0.015877,0.014016,0.001141,1.0,-0.039362,-0.082672,0.013561,-0.025423,-0.059224,-0.094781,-0.006665,0.011323,-0.025711


높은 상관성을 띠는 변수는 아래와 같다.
<ul>
    <li>CNT_FAM_MEMBERS , CNT_CHILDREN</li>
    <li>AMT_CREDIT_TO_ANNUITY_RATIO , AMT_CREDIT</li>
    <li>AMT_ANNUITY , AMT_CREDIT</li>
</ul>

CNT_FAM_MEMBERS와 CNT_CHILDREN 중 타겟값과의 상관계수가 더 높은 변수는 CNT_CHILDREN이므로, CNT_FAM_MEMBERS를 삭제한다.

In [13]:
# Print the correlation with TARGET for CNT_FAM_MEMBERS, then CNT_CHILDREN,
# one value per line.
print(
    *(
        train[column].corr(train['TARGET'])
        for column in ('CNT_FAM_MEMBERS', 'CNT_CHILDREN')
    ),
    sep='\n',
)

0.018876651698723716
0.025357359317615746


In [14]:
# Drop CNT_FAM_MEMBERS from both frames.
# Rebinding through drop(..., errors='ignore') instead of an in-place `del`
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-removed column.
train = train.drop(columns=['CNT_FAM_MEMBERS'], errors='ignore')
test = test.drop(columns=['CNT_FAM_MEMBERS'], errors='ignore')

AMT_CREDIT_TO_ANNUITY_RATIO와 AMT_CREDIT 중 타겟값과의 상관계수(절댓값 기준)가 더 높은 변수는 AMT_CREDIT_TO_ANNUITY_RATIO이므로, AMT_CREDIT를 삭제한다.

In [15]:
# Print the correlation with TARGET for AMT_CREDIT_TO_ANNUITY_RATIO, then
# AMT_CREDIT, one value per line.
print(
    *(
        train[column].corr(train['TARGET'])
        for column in ('AMT_CREDIT_TO_ANNUITY_RATIO', 'AMT_CREDIT')
    ),
    sep='\n',
)

-0.024740288335190173
-0.0225584308493476


In [16]:
# Drop AMT_CREDIT from both frames.
# Rebinding through drop(..., errors='ignore') instead of an in-place `del`
# keeps this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-removed column.
train = train.drop(columns=['AMT_CREDIT'], errors='ignore')
test = test.drop(columns=['AMT_CREDIT'], errors='ignore')

In [17]:
# Display the columns remaining in train after removing CNT_FAM_MEMBERS and AMT_CREDIT.
train.columns

Index(['SK_ID_CURR', 'TARGET', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'REGION_POPULATION_RELATIVE',
       'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
       'HOUR_APPR_PROCESS_START', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'AMT_CREDIT_TO_ANNUITY_RATIO',
       'AMT_CREDIT_SUM', 'DAYS_CREDIT', 'CNT_CREDIT_PROLONG', 'count'],
      dtype='object')

In [18]:
# Final model inputs: every remaining train column except SK_ID_CURR and
# TARGET, kept in the same order as train.columns.
input_var = [
    'FLAG_OWN_REALTY',
    'CNT_CHILDREN',
    'AMT_INCOME_TOTAL',
    'AMT_ANNUITY',
    'REGION_POPULATION_RELATIVE',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'DAYS_ID_PUBLISH',
    'OWN_CAR_AGE',
    'HOUR_APPR_PROCESS_START',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
    # Engineered features
    'AMT_CREDIT_TO_ANNUITY_RATIO',
    'AMT_CREDIT_SUM',
    'DAYS_CREDIT',
    'CNT_CREDIT_PROLONG',
    'count',
]

<strong>xgboost를 활용해 모델링을 진행한다.</strong>

이유 : shap value를 빠르게 계산하는 tree 전용 방식(TreeExplainer)을 활용하려면 모델이 랜덤포레스트나 그래디언트 부스팅 같은 tree 기반 모델이어야 한다. 이 중에서 xgboost가 속도가 빠르면서 높은 성능을 유지하므로 이를 선택하였다.

In [19]:
from xgboost import XGBClassifier

In [20]:
# Fit an XGBoost classifier (100 trees, learning rate 0.1) on the selected
# inputs against TARGET. The bare `model` on the last line displays the fitted
# estimator as the cell output.
model = XGBClassifier(learning_rate=0.1, n_estimators=100).fit(
    train[input_var], train['TARGET']
)
model





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## 2.4 shap value 활용