In [1]:
!pip install optuna





In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, random
warnings.filterwarnings(action='ignore')

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool

In [None]:
# 데이터 불러오기

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
# 데이터 전처리
# 1. 결측치 처리

train.fillna('NaN', inplace=True) 
test.fillna('NaN', inplace=True)

In [None]:
# 데이터 전처리
# 2. 이상치 처리
# train['family_size'] > 7 인 데이터 제거

train = train[(train['family_size'] <= 7)]
train = train.reset_index(drop=True)

In [None]:
# Feature Engineering
# 1. 의미없는 변수 제거
# index 제거
# FLAG_MOBIL 삭제:모든 값이 1로 동일

train.drop(['index', 'FLAG_MOBIL'], axis=1, inplace=True)
test.drop(['index', 'FLAG_MOBIL'], axis=1, inplace=True)

In [None]:
# Feature Engineering
# 2. DAYS_EMPLOYED
# 양수인 데이터는 현재 무직자로 판단, 0 처리

train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].map(lambda x: 0 if x > 0 else x)

In [None]:
# Feature Engineering
# 3. DAYS_BIRTH, begin_month, DAYS_EMPLOYED
# 음수값 -> 양수 변환

feats = ['DAYS_BIRTH', 'begin_month', 'DAYS_EMPLOYED']
for feat in feats:
    train[feat]=np.abs(train[feat])

In [None]:
# Feature Engineering
# 4. 파생변수
# numeric 변수는 최대한 다양한 특징을 보일 수 있도록 생성
# category 변수는 여러가지를 조합해 보았지만 전체 변수를 합친 ID 하나만 만들었을때 가장 logloss가 낮음


for df in [train, test]:
    # before_EMPLOYED: 고용되기 전까지의 일수
    df['before_EMPLOYED'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
    df['income_total_befofeEMP_ratio'] = df['income_total'] / df['before_EMPLOYED']
    df['before_EMPLOYED_m'] = np.floor(df['before_EMPLOYED'] / 30) - ((np.floor(df['before_EMPLOYED'] / 30) / 12).astype(int) * 12)
    df['before_EMPLOYED_w'] = np.floor(df['before_EMPLOYED'] / 7) - ((np.floor(df['before_EMPLOYED'] / 7) / 4).astype(int) * 4)
    
    #DAYS_BIRTH 파생변수- Age(나이), 태어난 월, 태어난 주(출생연도의 n주차)
    df['Age'] = df['DAYS_BIRTH'] // 365
    df['DAYS_BIRTH_m'] = np.floor(df['DAYS_BIRTH'] / 30) - ((np.floor(df['DAYS_BIRTH'] / 30) / 12).astype(int) * 12)
    df['DAYS_BIRTH_w'] = np.floor(df['DAYS_BIRTH'] / 7) - ((np.floor(df['DAYS_BIRTH'] / 7) / 4).astype(int) * 4)

    
    #DAYS_EMPLOYED_m 파생변수- EMPLOYED(근속연수), DAYS_EMPLOYED_m(고용된 달) ,DAYS_EMPLOYED_w(고용된 주(고용연도의 n주차))  
    df['EMPLOYED'] = df['DAYS_EMPLOYED'] // 365
    df['DAYS_EMPLOYED_m'] = np.floor(df['DAYS_EMPLOYED'] / 30) - ((np.floor(df['DAYS_EMPLOYED'] / 30) / 12).astype(int) * 12)
    df['DAYS_EMPLOYED_w'] = np.floor(df['DAYS_EMPLOYED'] / 7) - ((np.floor(df['DAYS_EMPLOYED'] / 7) / 4).astype(int) * 4)

    #ability: 소득/(살아온 일수+ 근무일수)
    df['ability'] = df['income_total'] / (df['DAYS_BIRTH'] + df['DAYS_EMPLOYED'])
    
    #income_mean: 소득/ 가족 수
    df['income_mean'] = df['income_total'] / df['family_size']
    
    #ID 생성: 각 컬럼의 값들을 더해서 고유한 사람을 파악(*한 사람이 여러 개 카드를 만들 가능성을 고려해 begin_month는 제외함)
    df['ID'] = \
    df['child_num'].astype(str) + '_' + df['income_total'].astype(str) + '_' +\
    df['DAYS_BIRTH'].astype(str) + '_' + df['DAYS_EMPLOYED'].astype(str) + '_' +\
    df['work_phone'].astype(str) + '_' + df['phone'].astype(str) + '_' +\
    df['email'].astype(str) + '_' + df['family_size'].astype(str) + '_' +\
    df['gender'].astype(str) + '_' + df['car'].astype(str) + '_' +\
    df['reality'].astype(str) + '_' + df['income_type'].astype(str) + '_' +\
    df['edu_type'].astype(str) + '_' + df['family_type'].astype(str) + '_' +\
    df['house_type'].astype(str) + '_' + df['occyp_type'].astype(str)

In [None]:
# Feature Engineering
# 5. 파생변수와 다중공선을 보이는 컬럼 삭제

cols = ['child_num', 'DAYS_BIRTH', 'DAYS_EMPLOYED',]
train.drop(cols, axis=1, inplace=True)
test.drop(cols, axis=1, inplace=True)

In [None]:
# Scaling, Encoding
# 1. Numeric, Category 컬럼 분류

numerical_feats = train.dtypes[train.dtypes != "object"].index.tolist()
numerical_feats.remove('credit')

categorical_feats = train.dtypes[train.dtypes == "object"].index.tolist()

print("Number of Numerical features: ", len(numerical_feats))
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  18
Number of Categorical features:  9


In [None]:
# Scaling, Encoding
# 2. Log Scale + 개별 feature 또는 target의 분포가 왜곡이 심할 경우 로그변환
# income_total

for df in [train,test]:
    df['income_total'] = np.log1p(1+df['income_total'])

In [None]:
# Scaling, Encoding
# 5. StandardScale
# 이미 로그변환을 진행한 income_total을 제외한 나머지 numeric 컬럼 정규화

numerical_feats.remove('income_total')
scaler = StandardScaler()
train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
test[numerical_feats] = scaler.transform(test[numerical_feats])

In [None]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)

object_col2 = []
for col2 in test.columns:
    if test[col2].dtype == 'object':
        object_col2.append(col2)

In [None]:
object_col, object_col2

(['gender',
  'car',
  'reality',
  'income_type',
  'edu_type',
  'family_type',
  'house_type',
  'occyp_type',
  'ID'],
 ['gender',
  'car',
  'reality',
  'income_type',
  'edu_type',
  'family_type',
  'house_type',
  'occyp_type',
  'ID'])

In [None]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])

train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [None]:
enc2 = OneHotEncoder()
enc2.fit(test.loc[:,object_col2])

test_onehot_df = pd.DataFrame(enc2.transform(test.loc[:,object_col2]).toarray(), 
             columns=enc2.get_feature_names(object_col2))
test.drop(object_col2, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

In [None]:
train.shape, test.shape

((26451, 8821), (10000, 5649))

In [None]:
X = train
y = train['credit']
X_test = test.copy()

In [None]:
# 데이터 분리는 StratifiedKFold 를 사용하여 y값 분포를 비슷하게 분리시킴. -> 5-fold
# lightgbm의 default parameter로 훈련.
# 30번 이상 개선 없을 경우 중단.
# 각 5개의 fold를 훈련하여 저장
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')


Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.646294	valid_1's multi_logloss: 0.748779
[200]	training's multi_logloss: 0.563033	valid_1's multi_logloss: 0.72888
[300]	training's multi_logloss: 0.500858	valid_1's multi_logloss: 0.721748
[400]	training's multi_logloss: 0.449955	valid_1's multi_logloss: 0.719547
Early stopping, best iteration is:
[390]	training's multi_logloss: 0.454545	valid_1's multi_logloss: 0.719269


Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.64864	valid_1's multi_logloss: 0.749833
[200]	training's multi_logloss: 0.56114	valid_1's multi_logloss: 0.728638
[300]	training's multi_logloss: 0.497265	valid_1's multi_logloss: 0.720354
[400]	training's multi_logloss: 0.448083	valid_1's multi_logloss: 0.718603
Early stopping, best iteration is:
[387]	training's multi_logloss: 0.453519	valid_1's multi_logloss: 0.718182


Training until validation scores don't improve for 30 r

In [None]:
#optuna 활용한 최적 하이퍼파라미터 서치
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split

def objective(trial: Trial) -> float:
    params_lgb = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "objective": "multiclass",
        "metric": "multi_logloss",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

    model = LGBMClassifier(**params_lgb)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=False,
    )

    lgb_pred = model.predict_proba(X_valid)
    log_score = log_loss(y_valid, lgb_pred)
    
    return log_score

In [None]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2021-08-31 08:40:55,039][0m A new study created in memory with name: lgbm_parameter_opt[0m
[32m[I 2021-08-31 08:41:38,908][0m Trial 0 finished with value: 2.3942080329035147e-07 and parameters: {'reg_alpha': 1.12424581642324e-05, 'reg_lambda': 0.08556428806974939, 'max_depth': 15, 'num_leaves': 154, 'colsample_bytree': 0.4936111842654619, 'subsample': 0.40919616423534183, 'subsample_freq': 1, 'min_child_samples': 88, 'max_bin': 380}. Best is trial 0 with value: 2.3942080329035147e-07.[0m
[32m[I 2021-08-31 08:41:59,011][0m Trial 1 finished with value: 3.208155335351996e-07 and parameters: {'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 20, 'num_leaves': 214, 'colsample_bytree': 0.5274034664069657, 'subsample': 0.42727747704497043, 'subsample_freq': 2, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 0 with value: 2.3942080329035147e-07.[0m
[32m[I 2021-08-31 08:42:21,521][0m Trial 2 finished with value: 1.5601276805653721e-

Best Score: 1.0374241295932559e-07
Best trial: {'reg_alpha': 1.8702710823558463e-05, 'reg_lambda': 0.02978082892775818, 'max_depth': 2, 'num_leaves': 81, 'colsample_bytree': 0.5951099932160482, 'subsample': 0.8107243248366449, 'subsample_freq': 7, 'min_child_samples': 90, 'max_bin': 342}


In [None]:
# optuna로 도출한 최적 파라미터 적용
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(reg_alpha= 1.8702710823558463e-05, 
                         reg_lambda= 0.02978082892775818, 
                         max_depth= 2, 
                         num_leaves= 81, 
                         colsample_bytree= 0.5951099932160482, 
                         subsample= 0.8107243248366449, 
                         subsample_freq= 7, 
                         min_child_samples= 90, 
                         max_bin= 342)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.795518	valid_1's multi_logloss: 0.801544
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.795518	valid_1's multi_logloss: 0.801544


Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.794634	valid_1's multi_logloss: 0.806062
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.794634	valid_1's multi_logloss: 0.806062


Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.79494	valid_1's multi_logloss: 0.803668
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.79494	valid_1's multi_logloss: 0.803668


Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.794436	valid_1's multi_logloss: 0.804607
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: