 # 신용카드 사용자 연체 예측 AI 경진대회
 - 2021.04.05 ~ 2021.05.24
 - private : 0.67995 (165등, 상위 22%)
 ---
 정형데이터 대회 참가는 처음이었다.
  처음 코드를 짤 때는 코드공유 게시판의 `최정명`님의 코드를 바탕으로 작성하였고, 이후 내가 원하는 방식으로 변수나, 모델을 수정하며 대회를 참가하였다.   
    
  이번대회를 통해 얻을 수 있었던 것은  
1. 어떤 모델이든 일단 적용해보고 봐야한다는것. 
    - RF의 성능이 꽤 괜찮았음.
    - 앙상블은 어떤 경우라도 성능향상이 ...
2. 정형데이터의 경우 Feature engineering이 정말 중요하다는 것.
    - 다른 사람의 코드를 보니, 변수 추가나, 다양한 방식으로 전처리를 진행 한것을보고 내가 조금 게을렀다고 생각들었다.  
  
  
  끝으로 결과가 다소 아쉽지만, 여러가지 배울점이 많았던 대회이고 게을러지지 말아야 겠다고 생각 드는 대회였다. 




# Library

In [None]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random

# Data Load & Preprocessing
- 훈련에 필요없는 index 컬럼 삭제.
- missing value를 모두 NAN 문자열로 대체
- dtype object 인 컬럼들을 onehot encoding

In [None]:
def _read_table(csv_path):
    """Read a competition CSV, drop its `index` column and fill gaps with 'NAN'."""
    table = pd.read_csv(csv_path)
    # `index` is not needed for training; every missing value becomes the
    # literal string 'NAN'.
    return table.drop(columns=['index']).fillna('NAN')


train = _read_table('train.csv')
test = _read_table('test.csv')

# Submission template, loaded as-is.
submit = pd.read_csv('sample_submission.csv')

## 중복 제거
---
- credit 제외 중복제거
- credit, begin_month 제외 중복제거

위 두 가지를 모두 적용해 보았지만, 제거되는 행 개수가 너무 많아 오히려 과적합이 심하게 발생함.   

따라서 중복 제거는 따로 적용해주지 않음.

In [None]:
# Drop duplicate rows, comparing every column except `credit` (not applied).
# The code is kept disabled inside a bare string literal.
'''

train = train.drop_duplicates(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
       'family_size','begin_month'])
train.reset_index(drop= True,inplace=  True)'''

In [None]:
# Drop duplicate rows, comparing every column except `credit` and
# `begin_month` (not applied). Kept disabled inside a bare string literal.
'''
train = train.drop_duplicates(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
       'family_size'])

train.reset_index(drop= True,inplace=  True)

'''

## 필요없는 변수 제거
---
- occyp_type 의 경우 결측치가 많아 변수 자체를 제거해 보았지만 성능이 오히려 안좋게 나옴
- child_num 의경우 family_size와 상관관계가 뚜렷했기때문에 제거해줌.

In [None]:
# Drop the `occyp_type` column from both tables (not applied).
# Kept disabled inside a bare string literal.
'''
train.drop('occyp_type',axis= 1 ,inplace = True)
test.drop('occyp_type',axis= 1 ,inplace = True)
'''

In [None]:
# Remove `child_num` from both tables in place.
for _frame in (train, test):
    _frame.drop(columns='child_num', inplace=True)

## 수치형 데이터중 음수를 양수로 변환 시켜주고, 일별 데이터를 년도별 데이터로 변환
---
- 코드 공유를 통해 얻은 전처리 방법.

In [None]:
# Flip the sign of the day/month offset columns and express the two DAYS_*
# columns in years (divide by 365). The same transformation is applied to
# train and test.
# NOTE(review): the saved describe() output shows DAYS_EMPLOYED with a positive
# maximum (365243); those rows become large negative values here — confirm
# that this is intended.
for _frame in (train, test):
    for _day_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED'):
        _frame[_day_col] = -1 * _frame[_day_col] / 365
    _frame['begin_month'] = -1 * _frame['begin_month']

## log 변환 
---
- 적용하지 않음
- 수치형 데이터의 음수가 존재 했기때문에, 애매하여 log 변환을 적용하지 않고 정규화 시켜줌.

In [None]:
# log1p transform of selected numeric columns (not applied).
# Kept disabled inside a bare string literal.
'''
candidate = ['income_total', 'DAYS_EMPLOYED', 'DAYS_BIRTH', 'begin_month']

for cand in candidate :
    train[cand] = np.log1p(train[cand])
    test[cand] = np.log1p(test[cand])
    '''

## 데이터 정규화
---
- 수치형 데이터중 변수 중요도가 높은 몇개의 변수만 정규화.


In [None]:
# Display summary statistics of `train` (notebook output only).
# NOTE(review): the saved output below still lists `child_num` and raw day
# counts, so it appears to come from a run made before the preprocessing
# cells — re-run to refresh.
train.describe()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,credit
count,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0,8759.0
mean,0.423907,181887.2,-15975.366594,60798.542642,1.0,0.218176,0.29033,0.08928,2.183012,-25.403813,1.546409
std,0.769883,100449.2,4241.781747,138862.582193,0.0,0.413031,0.453941,0.285163,0.935804,16.646729,0.69521
min,0.0,27000.0,-25152.0,-15713.0,1.0,0.0,0.0,0.0,1.0,-60.0,0.0
25%,0.0,112500.0,-19512.5,-3011.5,1.0,0.0,0.0,0.0,2.0,-39.0,1.0
50%,0.0,157500.0,-15607.0,-1379.0,1.0,0.0,0.0,0.0,2.0,-23.0,2.0
75%,1.0,225000.0,-12407.0,-348.0,1.0,0.0,1.0,0.0,3.0,-11.0,2.0
max,19.0,1575000.0,-7705.0,365243.0,1.0,1.0,1.0,1.0,20.0,0.0,2.0


In [None]:
# Earlier, wider candidate list (kept for reference, not used):
# columns = ['income_total',	'DAYS_BIRTH',	'DAYS_EMPLOYED',	'FLAG_MOBIL',	'work_phone',	'phone',	'email',	'family_size',	'begin_month']
# Numeric columns selected for scaling.
columns = ['income_total','DAYS_BIRTH','DAYS_EMPLOYED']

In [None]:
# Import only the scaler that is used; a wildcard import would pull every
# public name of sklearn.preprocessing into the notebook namespace.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training table only, so that test data does not
# influence the estimated mean and standard deviation.
standardScaler = StandardScaler()
standardScaler = standardScaler.fit(train[columns])

train[columns] = standardScaler.transform(train[columns])
# Apply the statistics learned from train to the test table as well.
test[columns] = standardScaler.transform(test[columns])

In [None]:
# Display the first rows of `train` (notebook output only).
# NOTE(review): the saved output below shows scaled values in columns such as
# work_phone and begin_month, which the current `columns` list does not scale;
# it appears to come from an earlier run — re-run to refresh.
train.head()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,F,N,N,0.149136,Commercial associate,Higher education,Married,Municipal apartment,-0.490075,0.46393,0.0,-0.538417,-0.645705,-0.316937,NAN,-0.214735,-1.215231,1.0
1,F,N,Y,0.590848,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-1.089621,0.440878,0.0,-0.538417,-0.645705,3.155199,Laborers,0.876135,-1.27562,1.0
2,M,Y,Y,2.57855,Working,Higher education,Married,House / apartment,0.744719,0.461929,0.0,-0.538417,1.548696,-0.316937,Managers,-0.214735,-0.249003,2.0
3,F,N,Y,0.149136,Commercial associate,Secondary / secondary special,Married,House / apartment,-0.207081,0.444893,0.0,-0.538417,1.548696,-0.316937,Sales staff,-0.214735,0.656836,0.0
4,F,Y,Y,-0.292575,State servant,Higher education,Married,House / apartment,-0.21922,0.444988,0.0,-0.538417,-0.645705,-0.316937,Managers,-0.214735,-0.007446,2.0


## bool 형 데이터 변환
---
- phone, mobile 등 소지 여부를 나타내는 변수가 int 형으로 입력 되어있었기 때문에, 카테고리화 적용.

In [None]:
# Flag columns are stored as integers; cast them to pandas' category dtype
# in both tables.
columns = ['FLAG_MOBIL', 'work_phone', 'phone', 'email']
for _frame in (train, test):
    _frame[columns] = _frame[columns].astype('category')

## 이상치 제거
---
- 변수 중요도가 높은 변수를 기준으로 이상치를 제거해 보았지만, 어떤 방식으로 제거하든 성능이 안 좋아져서 적용하지 않음.

In [None]:
# IQR-based outlier removal (not applied).
# Kept disabled inside a bare string literal.
'''
def remove_outlier(train,column):
    df = train[column]
    # 1분위수
    quan_25 = np.percentile(df.values, 25)
    
    # 3분위수
    quan_75 = np.percentile(df.values, 75)
    
    iqr = quan_75 - quan_25
    
    lowest = quan_25 - iqr * 1.5
    highest = quan_75 + iqr * 1.5
    outlier_index = df[(df < lowest) | (df > highest)].index
    print('outlier의 수 : ' , len(outlier_index))
    train.drop(outlier_index, axis = 0, inplace = True)
    
    return train

candidate = ['income_total', 'DAYS_EMPLOYED', 'family_size']
for cand in candidate:  
    train = remove_outlier(train,cand)

train.reset_index(drop = True,inplace = True)
len(train)
'''

outlier의 수 :  427
outlier의 수 :  0
outlier의 수 :  59


25971

## 원핫인코딩

In [None]:
# One-hot encode the object/category columns of each table.
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# The two tables are encoded independently, so a category level that occurs in
# only one of them would give the tables different (or differently ordered)
# columns. The models are later fit on positional arrays, so such a mismatch
# would silently shift features. Force test onto train's feature layout:
# levels missing from test become all-zero columns, and levels unseen in train
# are dropped.
_feature_columns = [col for col in train.columns if col != 'credit']
test = test.reindex(columns=_feature_columns, fill_value=0)

# 모델링
- 데이터 분리는 StratifiedKFold 를 사용하여 y값 분포를 비슷하게 분리시킴. -> 10-fold
- lgbm, catboost, RF, XGBoost 사용
- 각 모델 10개를 훈련하여 저장.
- `최정명`님 코드를 바탕으로 작성.

## bayesian optimization 
---
- bayesian optimization 를 이용하여 하이퍼 파라미터 튜닝
- lgbm만 적용.

In [None]:
# bayesian optimization 패키지 설치
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp37-none-any.whl size=11687 sha256=4b8decbfc92027af9b7ebab211cb70ca16eb25dbad37c85d331e0aa7233678e3
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [None]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

In [None]:
# Search space for the Bayesian optimizer: each entry maps a LightGBM
# hyper-parameter name to its (lower, upper) bound.
bayesian_params = dict(
    max_depth=(5, 20),
    num_leaves=(24, 64),
    min_child_samples=(10, 200),
    min_child_weight=(1, 50),
    subsample=(0.5, 1.0),
    colsample_bytree=(0.5, 1.0),
    max_bin=(10, 500),
    reg_lambda=(0.001, 10),
    reg_alpha=(0.01, 50),
)

def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight,
                 subsample, colsample_bytree, max_bin, reg_lambda, reg_alpha):
    """Objective for the Bayesian search: fit LightGBM and score it on the validation split.

    The optimizer passes every hyper-parameter as a float, so integer-valued
    ones are rounded and the fractional ones are clamped before use. The
    function reads the module-level ``train_x``, ``train_y``, ``valid_x`` and
    ``valid_y``.

    Returns:
        Macro-averaged F1 score of the class predictions on ``valid_x``.
    """
    def _to_int(value):
        return int(round(value))

    def _clamp_unit(value):
        return max(min(value, 1), 0)

    params = dict(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=_to_int(max_depth),
        num_leaves=_to_int(num_leaves),
        min_child_samples=_to_int(min_child_samples),
        min_child_weight=_to_int(min_child_weight),
        subsample=_clamp_unit(subsample),
        colsample_bytree=_clamp_unit(colsample_bytree),
        max_bin=max(_to_int(max_bin), 10),
        reg_lambda=max(reg_lambda, 0),
        reg_alpha=max(reg_alpha, 0),
    )

    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        verbose=100,
        early_stopping_rounds=100,
    )

    predicted_labels = lgb_model.predict(valid_x)
    return f1_score(valid_y, predicted_labels, average='macro')

# train_test_split is used below but is not brought in by the notebook's
# import cells, so import it here to avoid a NameError.
from sklearn.model_selection import train_test_split

# Features and target used by the hyper-parameter search.
X = train.drop(['credit'], axis = 1)
y = train['credit']

# Single 80/20 hold-out split; lgb_roc_eval reads these four names as globals.
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the optimizer from the objective function and the search bounds.
lgbBO = BayesianOptimization(lgb_roc_eval,bayesian_params , random_state=0)
# Run 5 initial random probes followed by 25 guided iterations, maximizing
# the value returned by the objective.
lgbBO.maximize(init_points=5, n_iter=25)

## catboost

In [None]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 55kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1


In [None]:
from catboost import CatBoostClassifier, Pool

# 10 stratified, shuffled folds over the target; each entry of `folds` is a
# (train positions, validation positions) pair reused by every model below.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = list(skf.split(train, train['credit']))

In [None]:

# Train one CatBoost model per fold and keep them in `cat_models`.
random.seed(42)
cat_models={}
sum_log = 0  # not used by the visible code; kept as in the original cell

# Split features/target once instead of rebuilding the frame twice per fold.
_fold_features = train.drop(['credit'], axis=1)
_fold_labels = train['credit']

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # The fold indices are positions, so select rows with .iloc; plain
    # label-based indexing is only correct while the index is 0..n-1.
    X_train = _fold_features.iloc[train_idx].values
    X_valid = _fold_features.iloc[valid_idx].values
    y_train = _fold_labels.iloc[train_idx].values
    y_valid = _fold_labels.iloc[valid_idx].values

    model = CatBoostClassifier(random_state=42, n_estimators=2000)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              early_stopping_rounds=50,
              verbose=100)
    cat_models[fold] = model
    print(f'================================================================================\n\n')



## RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train one random forest per fold and keep them in `rf_models`.
random.seed(42)
rf_models={}
sum_log = 0  # not used by the visible code; kept as in the original cell

# Split features/target once instead of rebuilding the frame twice per fold.
_fold_features = train.drop(['credit'], axis=1)
_fold_labels = train['credit']

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # The fold indices are positions, so select rows with .iloc; plain
    # label-based indexing is only correct while the index is 0..n-1.
    X_train = _fold_features.iloc[train_idx].values
    X_valid = _fold_features.iloc[valid_idx].values
    y_train = _fold_labels.iloc[train_idx].values
    y_valid = _fold_labels.iloc[valid_idx].values

    model = RandomForestClassifier(random_state=42, n_estimators=1000, verbose=1)
    model.fit(X_train, y_train)
    rf_models[fold] = model
    print(f'================================================================================\n\n')


## LGBM
---
Bayesian optimization을 이용하여 얻은 하이퍼 파라미터를 이용.

In [None]:
# Train one LightGBM model per fold and keep them in `lgb_models`.
random.seed(42)
lgb_models={}
sum_log = 0  # not used by the visible code; kept as in the original cell

# Split features/target once instead of rebuilding the frame twice per fold.
_fold_features = train.drop(['credit'], axis=1)
_fold_labels = train['credit']

for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')
    # The fold indices are positions, so select rows with .iloc; plain
    # label-based indexing is only correct while the index is 0..n-1.
    X_train = _fold_features.iloc[train_idx].values
    X_valid = _fold_features.iloc[valid_idx].values
    y_train = _fold_labels.iloc[train_idx].values
    y_valid = _fold_labels.iloc[valid_idx].values

    lgb = LGBMClassifier(n_estimators=2000,
                         max_depth=20,
                         min_child_samples=25,
                         learning_rate=0.1)
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=100,
            verbose=100)
    lgb_models[fold] = lgb
    print(f'================================================================================\n\n')



# Test inference
- 각 fold를 훈련시킨 LGBM,RF,Catboost로 predict.
- 해당 대회는 logloss score를 겨루는 것이기 때문에 각 class의 probability를 얻어야함.
- 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함.
- predict는 class 출력을 해주고 predict_proba는 class별 probability를 출력해줌.
- predict_proba를 사용하여 예측한 것을 10-fold × 3개 모델(총 30개)에 대해 더한 뒤 평균내어 앙상블.

In [None]:
# Reload the submission template so the accumulation below starts from a
# fresh frame.
submit = pd.read_csv('sample_submission.csv')

In [None]:
# Zero every column after the first, then add each CatBoost fold model's class
# probabilities. Dividing by 30 (10 folds x 3 model families) makes the final
# sum an average.
submit.iloc[:, 1:] = 0
for fold in range(10):
    fold_proba = cat_models[fold].predict_proba(test)
    submit.iloc[:, 1:] += fold_proba / 30

In [None]:
# Add the random-forest fold models' class probabilities, each weighted 1/30.
for fold in range(10):
    fold_proba = rf_models[fold].predict_proba(test)
    submit.iloc[:, 1:] += fold_proba / 30

In [None]:
# Add the LightGBM fold models' class probabilities, each weighted 1/30.
for fold in range(10):
    fold_proba = lgb_models[fold].predict_proba(test)
    submit.iloc[:, 1:] += fold_proba / 30

In [None]:
# Display the first 20 rows of the submission frame (notebook output only).
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.056566,0.076849,0.866586
1,26458,0.184885,0.171722,0.643393
2,26459,0.045312,0.137334,0.817354
3,26460,0.116267,0.130359,0.753374
4,26461,0.102403,0.231974,0.665623
5,26462,0.094298,0.129665,0.776037
6,26463,0.546128,0.453802,7e-05
7,26464,0.0949,0.117142,0.787958
8,26465,0.058308,0.09245,0.849242
9,26466,0.0461,0.309113,0.644787


In [None]:
# Write `submit` to final.csv without the DataFrame's row index.
submit.to_csv('final.csv', index=False) 