 # 신용카드 사용자 연체 예측 AI 경진대회
 - 2021.04.05 ~ 2021.05.24
 - private : 0.67995 (165등, 상위 22%)
 ---
 정형 데이터 대회 참가는 처음이었다.
  처음 코드를 짤 때는 코드공유 게시판의 `최정명`님의 코드를 바탕으로 작성하였고, 이후 내가 원하는 방식으로 변수나, 모델을 수정하며 대회를 참가하였다.   
    
이번 대회를 통해 얻을 수 있었던 것은  
1. 어떤 모델이든 일단 적용해 보고 봐야한다는것. 
    - RF의 성능이 꽤 괜찮았음.
    - 앙상블은 어떤 경우라도 성능 향상이 보장됨.
2. 정형데이터의 경우 Feature engineering가 정말 중요하다는 것.
    - 다른 사람의 코드를 보니, 변수 추가나, 다양한 방식으로 전처리를 진행한 것을 보고 내가 조금 게을렀다고 생각 들었다.  
  
  
  끝으로 결과가 다소 아쉽지만, 여러 가지 배울점이 많았던 대회이고 게을러지지 말아야겠다고 생각 드는 대회였다. 




# 라이브러리 로딩

In [1]:
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import glob
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [4]:
# occyp_type에만 결측치 존재
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

# Data Preprocessing
- EDA를 통한 데이터 전처리 진행
- 결측치 NaN 값으로 채우기
- 원-핫인코딩 적용

In [5]:
def drop_columns(df, drop_col):
    """
    eda를 통해 확인한 불필요한 변수 삭제

    파라미터 
    ---
    df : DataFrame
        train or test
    drop_col : list
        제거 할 컬럼 명
    returns
    ---
    df : DataFrame
        불필요한 columns이 제거된 Dataframe
    """

    df = df.drop(drop_col, axis=1)
    return df

In [6]:
def drop_dupli(df, mode):
    """
    중복을 제거하는 코드 

    파라미터 
    ---
    df : DataFrame
        train or test
    mode : int
        0 : credit제외 중복 제거
        1 : credit, begin_month제외 중복 제거

    returns 
    ---
    df : DataFrame
        중복이 제거된 dataframe    
    """
    assert mode < 2, 'mode는 0과 1중 하나만 입력해주세요'

    if mode == 0:
        df = df.drop_duplicates(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
                                 'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
                                 'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
                                 'family_size', 'begin_month'])

    else:
        df = df.drop_duplicates(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
                                 'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
                                 'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'occyp_type',
                                 'family_size'])

    # index 초기화
    df.reset_index(drop=True, inplace=True)
    return df

In [7]:
def change_numeric(df):
    """
    수치형 데이터 중 음수를 양수로 변환 시켜주고,
    일별 데이터를 년도별 데이터로 변환

    파라미터
    ---
    df : DataFrame
        train or test

    returns
    ---
    df : DataFrame
        수치형 데이터가 변환된 dataframe
    """
    df['DAYS_BIRTH'] = -1 * df['DAYS_BIRTH'] / 365
    df['DAYS_EMPLOYED'] = -1 * df['DAYS_EMPLOYED'] / 365
    df['begin_month'] = -1 * df['begin_month']

    return df

In [8]:
from sklearn.preprocessing import StandardScaler


def scaling(df, mode):
    """
    중복을 제거하는 코드 

    파라미터 
    ---
    df : DataFrame
        train or test
    mode : int
        0 : log 변환 적용
        1 : standard scaling 적용

    returns 
    ---
    df : DataFrame
        scaling이 적용된 dataframe    
    """
    assert mode < 2, 'mode는 0과 1중 하나만 입력해주세요'

    # 수치형 데이터중 변수 중요도가 높은 몇개의 변수만 변환
    columns = ['income_total', 'DAYS_EMPLOYED', 'DAYS_BIRTH']

    if mode == 0:
        # 로그 변환
        for col in columns:
            train[col] = np.log1p(train[col])
            test[col] = np.log1p(test[col])

    else:
        standardScaler = StandardScaler()
        standardScaler = standardScaler.fit(train[columns])

        train[columns] = standardScaler.transform(train[columns])
        # test 셋에도 동일 하게 적용.
        test[columns] = standardScaler.transform(test[columns])

    return df

In [9]:
def change_boolen_type(df):
    """
    phone, mobile 등 소지 여부를 나타내는 변수가 
    int 형으로 입력 되어있었기 때문에 카테고리화 적용.

    파라미터
    ---
    df : DataFrame
        train or test

    returns
    ---
    df : DataFrame
        수치형 데이터가 변환된 dataframe
    """
    columns = ['FLAG_MOBIL', 'work_phone', 'phone', 'email']
    df[columns] = df[columns].astype('category')
    return df

In [10]:
def drop_outlier(df, column):
    """
    변수 중요도가 높은 것을 우선순위로 이상치 제거


    파라미터
    ---
    df : DataFrame
        train or test
    columns : list
        이상치를 제거할 변수가 담겨있는 list
    returns
    ---
    df : DataFrame
        수치형 데이터가 변환된 dataframe
    """

    for col in column:
        df_ = df[col]
        # 1분위수
        quan_25 = np.percentile(df_.values, 25)

        # 3분위수
        quan_75 = np.percentile(df_.values, 75)

        iqr = quan_75 - quan_25

        lowest = quan_25 - iqr * 1.5
        highest = quan_75 + iqr * 1.5
        outlier_index = df_[(df_ < lowest) | (df_ > highest)].index
        df.drop(outlier_index, axis=0, inplace=True)

    return df

In [11]:
def data_preprocessing(df, drop_col, duplicate_mode, scaling_mode, drop_outlier_columns, data='None'):
    """
    위에서 정의한 전처리 함수를 한번에 적용하는 함수

    파라미터 
    ---
    df : DataFrame
        train or test

    drop_col : list
        drop 시킬 columns

    duplicate_mode : int
        0 : credit제외 중복 제거
        1 : credit, begin_month제외 중복 제거

    scaling_mode : int 
        0 : log 변환 적용
        1 : standard scaling 적용

    drop_outlier_columns: list
        이상치를 제거할 변수가 담겨있는 list

    data : string
        해당 값이 cat 이면 catboost용 데이터를 생성 하기 위해 원핫 인코딩 미적용

    returns 
    ---
    df : DataFrame
        전처리가 적용된 dataframe

    """
    df = drop_columns(df, drop_col)

    # df = drop_dupli(df, duplicate_mode)

    df = change_numeric(df)
    df = scaling(df, scaling_mode)
    # 결측치 처리
    df.fillna('NaN', inplace=True)
    df = change_boolen_type(df)

    #df = drop_outlier(df, drop_outlier_columns)

    # 원핫인코딩
    if data == 'cat':
        return df
    df = pd.get_dummies(df)
    return df

In [12]:
# 삭제할 컬럼명을 저장할 list
drop_col = ['index', 'child_num']
DUPLICATE_MODE = 0
SCALING_MODE = 1
drop_outlier_columns = ['income_total', 'DAYS_EMPLOYED', 'family_size']

train = data_preprocessing(train, drop_col=drop_col, duplicate_mode=DUPLICATE_MODE, scaling_mode=SCALING_MODE,
                           drop_outlier_columns=drop_outlier_columns)
test = data_preprocessing(test, drop_col=drop_col, duplicate_mode=DUPLICATE_MODE, scaling_mode=SCALING_MODE,
                          drop_outlier_columns=drop_outlier_columns)

# 모델링
- 모델은 `Lightgbm`,`catboost`,`RandomForest`총 3개를 앙상블하여 결과값 도출
- lgbm 모델만 optuna 라이브러리로 최적화 진행
- 모델 검증 방법은 stritfied KFold 사용 (k = 10)
- 각 모델 10개를 훈련하여 저장.


## LGBM 튜닝

In [14]:
from lightgbm import LGBMClassifier
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [15]:
# 모델링 전 학습 타겟 label 분리
train_x = train.drop('credit', axis=1)
train_y = train['credit']

In [16]:
def objective(trial: Trial) -> float:
    """
    optuna 라이브러리를 활용하여 모델 튜닝시 사용되는 함수.
    """
    params_lgb = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.1),
        "n_estimators": 500,
        "objective": "multiclass",
        "metric": "multi_logloss",
        "max_depth": trial.suggest_int("max_depth", 1, 30),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 0.6),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 2, 30),
    }

    X_train, X_valid, y_train, y_valid = train_test_split(
        train_x, train_y, test_size=0.2, stratify=train_y)

    model = LGBMClassifier(**params_lgb)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=50,
        verbose=False,
    )

    lgb_pred = model.predict_proba(X_valid)
    log_score = log_loss(y_valid, lgb_pred)

    return log_score

In [17]:
# optna를 이용해 하이퍼 파라미터 튜닝할 준비를 해줍니다.
sampler = TPESampler(seed=42)
lgbm_study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)

[32m[I 2022-02-21 17:06:47,185][0m A new study created in memory with name: lgbm_parameter_opt[0m


In [44]:
# n_trais = 20 으로 설정하여 lgbm 모델을 튜닝
lgbm_study.optimize(objective, n_trials=20, show_progress_bar=True,)
print("Best Score:", lgbm_study.best_value)
print("Best trial:", lgbm_study.best_trial.params)

Best Score: 0.7065383925084582
Best trial: {'learning_rate': 0.07568316785311044, 'max_depth': 17, 'num_leaves': 249, 'colsample_bytree': 0.3387403565626488, 'subsample': 0.7717804630243458, 'subsample_freq': 5, 'min_child_samples': 11}


In [19]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = []
for train_idx, valid_idx in skf.split(train_x, train_y):
    folds.append((train_idx, valid_idx))

In [32]:
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
def training(model_name):
    """
    모델이름을 입력받아 10-fold로 학습된 모델 리스트를 반환하는 함수

    파라미터
    ---
    model_name : LGBM or RF or CAT
        학습 시킬 모델
    returns
    ---
    models : 
        학습 된 10개 모델
    """
    random.seed(42)
    models = {}
    for fold in range(10):
        print(
            f'===================================={fold+1}============================================')
        train_idx, valid_idx = folds[fold]
        X_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
        X_valid, y_valid = train_x.iloc[valid_idx], train_y.iloc[valid_idx]

        if model_name == 'RF':
            model = RandomForestClassifier(
                random_state=42, n_estimators=1000, verbose=False)
            model.fit(X_train, y_train)

        elif model_name == 'LGBM':
            model = LGBMClassifier(**lgbm_study.best_trial.params)
            model.fit(X_train, y_train,
                    eval_set=[(X_train, y_train), (X_valid, y_valid)],
                    early_stopping_rounds=100, eval_metric='logloss',
                    verbose=100)

        elif model_name == 'CAT':
            model = CatBoostClassifier(random_state=42, n_estimators=2000)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      early_stopping_rounds=50,
                      verbose=100)

        models[fold] = model

        print(f'================================================================================\n\n')
    return models

In [33]:
model_names = ['RF','LGBM','CAT']

for model_name in model_names : 
    models = training(model_name)
    # 학습 된 모델로 test inference 
    for fold in range(10):
        submit.iloc[:, 1:] += models[fold].predict_proba(test)/30    





















Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.451697	valid_1's multi_logloss: 0.696987
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.451697	valid_1's multi_logloss: 0.696987


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.449103	valid_1's multi_logloss: 0.706522
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.449103	valid_1's multi_logloss: 0.706522


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.449813	valid_1's multi_logloss: 0.70811
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.449813	valid_1's multi_logloss: 0.70811


Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.448239	valid_1's multi_logloss: 0.723238
Did not meet early stopping. Best iteration is:
[100]	train

In [36]:
submit.head(20)

Unnamed: 0,index,0,1,2
0,26457,0.112692,0.244362,0.976278
1,26458,0.26008,0.198007,0.875246
2,26459,0.110193,0.133125,1.090016
3,26460,0.18483,0.181942,0.966561
4,26461,0.157352,0.292204,0.883777
5,26462,0.087266,0.246333,0.999734
6,26463,0.585735,0.661335,0.086263
7,26464,0.227565,0.160282,0.945487
8,26465,0.091522,0.173047,1.068765
9,26466,0.126698,0.210394,0.996242


In [None]:
# 정답파일 생성
submit.to_csv('final.csv', index=False)