# Data 처리

1. Read Data
2. 결측치 처리
3. 데이터 형 변환
4. 파생변수 생성

In [1]:
import pandas as pd
import numpy as np
# import pandas_profiling as pp
import gc

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

# category encoding
from category_encoders.target_encoder import TargetEncoder
# from category_encoders.one_hoe import OneHotEncoder

# Utility
import os
import time
import random
import joblib
import re

pd.set_option('display.max_columns', 100)

### 3.0 Load 

In [2]:
with open('../input/01.2.make_feature_4M_shift_data.pkl', 'rb') as f:
    pkl_data = joblib.load(f)
locals().update(pkl_data)

### 데이터 정리

In [3]:
response_test = pd.read_csv(os.path.abspath("../input") + '/response_test.csv')

In [4]:
ID=response_test.ID

In [5]:
# 학습용 데이터 분리를 고려하여 train을 TIME으로 sorting 한다.
train.sort_values(by='TIME', inplace=True)

In [6]:
y_train = train.STATUS

In [7]:
# train+test feature의 train, test 구분을 위한 index 저장
train_idx = train.shape[0]
train_idx

1506055

In [8]:
train.shape, test.shape

((1506055, 89), (1355517, 89))

In [9]:
# 데이터 전처리를 위한 train+test feature 생성
features = pd.concat([train[features_cols],test[features_cols]]).reset_index(drop=True)

In [10]:
del train, test
gc.collect()

74

###  Impute missing values

In [11]:
num_features = ['BIRTH','SQ_R', 'A_R', 'B_R', 'C_R', 'D_R', 'F_R', 'H_R', 'T_R', 'X_R', 'ALL_R',
                'TITLE', 'IR', 'LOI', 'CPI', 'MONTH', 'DAY', 'HOUR', 
                'RES_RATE', 'RES_RATE_3M','TYPE_RES_RATE', # 'LAST_RES_DAYS',
                'TOT_CPI', 'AVG_CPI', 'MIN_CPI', 'MAX_CPI', 'TOT_IR', 'AVG_IR', 'MIN_IR', 'MAX_IR',
                'TOT_LOI', 'AVG_LOI', 'MIN_LOI', 'MAX_LOI', 
                'WEEKDAY0_RES_RATE', 'WEEKDAY1_RES_RATE', 'WEEKDAY2_RES_RATE', 'WEEKDAY3_RES_RATE',
                'WEEKDAY4_RES_RATE', 'WEEKDAY5_RES_RATE', 'WEEKDAY6_RES_RATE',
                'HOURCLS0_RES_RATE', 'HOURCLS1_RES_RATE', 'HOURCLS2_RES_RATE', 'HOURCLS3_RES_RATE',
                'SURVEY_HOURCLS_RES_RATE']

In [12]:
cat_features = list(set(features.columns) - set(num_features))
cat_features

['SQ4',
 'B4',
 'B1',
 'B5',
 'SQ5',
 '대상지역',
 '대상자유형',
 'A1',
 'TYPE',
 'REGION',
 'WEEKDAY',
 'SQ8',
 'SQ6',
 'B3',
 'B2',
 'GENDER',
 'SQ7']

In [13]:
features[cat_features+['BIRTH']] = features[cat_features+['BIRTH']].fillna(features[cat_features].mode())

In [14]:
# num feature 중 0으로 채울 대상
apply_zero_features = ['SQ_R', 'A_R', 'B_R', 'C_R', 'D_R', 'F_R', 'H_R', 'T_R', 'X_R', 'ALL_R',
                       'RES_RATE', 'RES_RATE_3M',
                       'TOT_CPI', 'AVG_CPI', 'MIN_CPI', 'MAX_CPI', 
                       'TOT_IR', 'AVG_IR', 'MIN_IR', 'MAX_IR',
                       'TOT_LOI', 'AVG_LOI', 'MIN_LOI', 'MAX_LOI',
                       'WEEKDAY0_RES_RATE', 'WEEKDAY1_RES_RATE', 'WEEKDAY2_RES_RATE', 'WEEKDAY3_RES_RATE',
                       'WEEKDAY4_RES_RATE', 'WEEKDAY5_RES_RATE', 'WEEKDAY6_RES_RATE',
                       'HOURCLS0_RES_RATE', 'HOURCLS1_RES_RATE', 'HOURCLS2_RES_RATE', 'HOURCLS3_RES_RATE'                       
                      ]
features[apply_zero_features] = features[apply_zero_features].fillna(0)

###### 응답율 추이 파생변수 추가 

In [15]:
# 응답율 추이 변수 추가
features['RES_RATE_TREND'] = (features['RES_RATE_3M'] - features['RES_RATE'])# / features['RES_RATE']

In [16]:
num_features.extend(['RES_RATE_TREND'])

In [17]:
#0으로 채우는게 맞을까...유지하는 것이니...맞다고 볼 수 있을듯
features[['RES_RATE_TREND']] = features[['RES_RATE_TREND']].fillna(0)

In [18]:
features['GENDER'] = features['GENDER'].astype(int)
features['REGION'] = features['REGION'].astype(int)

In [19]:
features.columns

Index(['BIRTH', 'GENDER', 'REGION', 'TYPE', 'SQ4', 'SQ5', 'SQ6', 'SQ7', 'SQ8',
       'A1', 'B1', 'B2', 'B3', 'B4', 'B5', 'SQ_R', 'A_R', 'B_R', 'C_R', 'D_R',
       'F_R', 'H_R', 'T_R', 'X_R', 'ALL_R', 'TITLE', 'IR', 'LOI', 'CPI',
       '대상지역', '대상자유형', 'MONTH', 'DAY', 'HOUR', 'WEEKDAY', 'RES_RATE',
       'RES_RATE_3M', 'TYPE_RES_RATE', 'TOT_CPI', 'AVG_CPI', 'MIN_CPI',
       'MAX_CPI', 'TOT_IR', 'AVG_IR', 'MIN_IR', 'MAX_IR', 'TOT_LOI', 'AVG_LOI',
       'MIN_LOI', 'MAX_LOI', 'WEEKDAY0_RES_RATE', 'WEEKDAY1_RES_RATE',
       'WEEKDAY2_RES_RATE', 'WEEKDAY3_RES_RATE', 'WEEKDAY4_RES_RATE',
       'WEEKDAY5_RES_RATE', 'WEEKDAY6_RES_RATE', 'HOURCLS0_RES_RATE',
       'HOURCLS1_RES_RATE', 'HOURCLS2_RES_RATE', 'HOURCLS3_RES_RATE',
       'SURVEY_HOURCLS_RES_RATE', 'RES_RATE_TREND'],
      dtype='object')

### Transform features (feature Scaling) 

In [20]:
# DNN 모델링에서는 StandardScaler을 주로 사용
sacling_features_col = features.columns.tolist()
scaler = StandardScaler()
features[num_features] = scaler.fit_transform(features[num_features])

### 범주형 변수 인코딩 

In [21]:
te_features = features.copy()
ohe_feature = features.copy()

In [22]:
encoder = TargetEncoder(cols=cat_features)
encoder.fit(te_features[cat_features].iloc[:train_idx, :], y_train)
te_features[cat_features] = encoder.transform(te_features[cat_features])

In [23]:
ohe_feature =  pd.get_dummies(ohe_feature, columns=cat_features)

In [24]:
X_train = ohe_feature.iloc[:train_idx, :]
X_test = ohe_feature.iloc[train_idx:, :]
y_train = y_train
cat_features=cat_features

# Modeling

In [25]:
from lightgbm import LGBMClassifier

# 모델
lgbm_clf = LGBMClassifier(random_state = 20182817,  n_estimators=500)

In [26]:
lgbm_clf.fit(X_train, y_train, eval_set=[(X_train, y_train)], eval_metric = ['auc','logloss'], 
        verbose=True, early_stopping_rounds=10)

[1]	training's auc: 0.910152	training's binary_logloss: 0.603552
Training until validation scores don't improve for 10 rounds
[2]	training's auc: 0.913853	training's binary_logloss: 0.564756
[3]	training's auc: 0.915119	training's binary_logloss: 0.532752
[4]	training's auc: 0.916724	training's binary_logloss: 0.505937
[5]	training's auc: 0.917621	training's binary_logloss: 0.483264
[6]	training's auc: 0.918332	training's binary_logloss: 0.464038
[7]	training's auc: 0.918803	training's binary_logloss: 0.447441
[8]	training's auc: 0.919775	training's binary_logloss: 0.432988
[9]	training's auc: 0.920366	training's binary_logloss: 0.420562
[10]	training's auc: 0.921411	training's binary_logloss: 0.409533
[11]	training's auc: 0.922023	training's binary_logloss: 0.399907
[12]	training's auc: 0.922592	training's binary_logloss: 0.39137
[13]	training's auc: 0.922924	training's binary_logloss: 0.384147
[14]	training's auc: 0.923339	training's binary_logloss: 0.377692
[15]	training's auc: 0.92

[125]	training's auc: 0.944465	training's binary_logloss: 0.288283
[126]	training's auc: 0.9445	training's binary_logloss: 0.288192
[127]	training's auc: 0.944564	training's binary_logloss: 0.288026
[128]	training's auc: 0.944618	training's binary_logloss: 0.28788
[129]	training's auc: 0.944678	training's binary_logloss: 0.287719
[130]	training's auc: 0.944746	training's binary_logloss: 0.287551
[131]	training's auc: 0.944848	training's binary_logloss: 0.287287
[132]	training's auc: 0.9449	training's binary_logloss: 0.287148
[133]	training's auc: 0.945003	training's binary_logloss: 0.28691
[134]	training's auc: 0.945069	training's binary_logloss: 0.286753
[135]	training's auc: 0.945169	training's binary_logloss: 0.286487
[136]	training's auc: 0.945217	training's binary_logloss: 0.286363
[137]	training's auc: 0.945257	training's binary_logloss: 0.286266
[138]	training's auc: 0.945296	training's binary_logloss: 0.286166
[139]	training's auc: 0.945344	training's binary_logloss: 0.286039
[

[248]	training's auc: 0.94934	training's binary_logloss: 0.275574
[249]	training's auc: 0.949366	training's binary_logloss: 0.275492
[250]	training's auc: 0.949395	training's binary_logloss: 0.275424
[251]	training's auc: 0.949418	training's binary_logloss: 0.275372
[252]	training's auc: 0.949453	training's binary_logloss: 0.275284
[253]	training's auc: 0.94947	training's binary_logloss: 0.275246
[254]	training's auc: 0.949494	training's binary_logloss: 0.275188
[255]	training's auc: 0.949513	training's binary_logloss: 0.275139
[256]	training's auc: 0.949526	training's binary_logloss: 0.275109
[257]	training's auc: 0.94957	training's binary_logloss: 0.274999
[258]	training's auc: 0.949594	training's binary_logloss: 0.274934
[259]	training's auc: 0.949637	training's binary_logloss: 0.274825
[260]	training's auc: 0.949652	training's binary_logloss: 0.274785
[261]	training's auc: 0.949667	training's binary_logloss: 0.274745
[262]	training's auc: 0.94969	training's binary_logloss: 0.274682

[371]	training's auc: 0.952136	training's binary_logloss: 0.268155
[372]	training's auc: 0.952144	training's binary_logloss: 0.268133
[373]	training's auc: 0.952157	training's binary_logloss: 0.2681
[374]	training's auc: 0.952192	training's binary_logloss: 0.268008
[375]	training's auc: 0.952208	training's binary_logloss: 0.26796
[376]	training's auc: 0.952215	training's binary_logloss: 0.267938
[377]	training's auc: 0.952225	training's binary_logloss: 0.267909
[378]	training's auc: 0.952239	training's binary_logloss: 0.26787
[379]	training's auc: 0.952257	training's binary_logloss: 0.267817
[380]	training's auc: 0.952271	training's binary_logloss: 0.267772
[381]	training's auc: 0.952285	training's binary_logloss: 0.267735
[382]	training's auc: 0.952303	training's binary_logloss: 0.267685
[383]	training's auc: 0.952321	training's binary_logloss: 0.267633
[384]	training's auc: 0.952371	training's binary_logloss: 0.2675
[385]	training's auc: 0.952381	training's binary_logloss: 0.267463
[

[494]	training's auc: 0.954269	training's binary_logloss: 0.262323
[495]	training's auc: 0.954275	training's binary_logloss: 0.262305
[496]	training's auc: 0.954284	training's binary_logloss: 0.262278
[497]	training's auc: 0.954304	training's binary_logloss: 0.262218
[498]	training's auc: 0.95432	training's binary_logloss: 0.262172
[499]	training's auc: 0.954337	training's binary_logloss: 0.262126
[500]	training's auc: 0.954351	training's binary_logloss: 0.262087
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.954351	training's binary_logloss: 0.262087


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
               random_state=20182817, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [28]:
pd.DataFrame({'ID':ID, 'STATUS': lgbm_clf.predict_proba(X_test)[:,1]}).to_csv(os.path.abspath("./e_submission") + "/new_lgbm.csv", index=False)