In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(42)

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score

# 1. 데이터 로드

In [2]:
train=pd.read_csv('input/train.csv', index_col=0) #인덱스 col=0도 check!
test=pd.read_csv('input/test_x.csv', index_col=0)
submission=pd.read_csv('input/sample_submission.csv', index_col=0)  

In [None]:
train.shape, test.shape, submission.shape

In [None]:
train.head(3)
test.head(3)
submission.head(3)

# 2. EDA & 전처리

In [None]:
df = pd.concat([train, test])
df.shape

In [None]:
train.info

## Missing Value

In [None]:
train.isnull().sum().sum()
test.isnull().sum().sum()

## Target 분포

In [None]:
f, ax = plt.subplots(1, 2, figsize=(10, 4))

train['voted'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax[0], shadow=True)
sns.countplot('voted', data=train, ax=ax[1])

plt.show()

# 균일

## Outlier

In [None]:
test.familysize.max(), test.familysize.mean(), train[train.familysize < 30]['familysize'].mean()

In [None]:
train[train.familysize > 20]['familysize'].value_counts().sort_values(ascending=False)

In [3]:
train = train[train.familysize < 25]

In [None]:
train['familysize'].value_counts()

## 인코딩

In [4]:
#age_group, gender, race, religion
train['age_group'] = train['age_group'].str.replace('s','').astype('int')
train['gender']= pd.factorize(train['gender'])[0]
train['race']=pd.factorize(train['race'])[0]
train['religion']=pd.factorize(train['religion'])[0]

test['age_group'] = test['age_group'].str.replace('s','').astype('int')
test['gender']=pd.factorize(test['gender'])[0]
test['race']=pd.factorize(test['race'])[0]
test['religion']=pd.factorize(test['religion'])[0]

### One-hot

In [5]:
train = pd.get_dummies(train, columns=['age_group', 'education', 'engnat', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'])
train

Unnamed: 0_level_0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,...,wr_09_0,wr_09_1,wr_10_0,wr_10_1,wr_11_0,wr_11_1,wr_12_0,wr_12_1,wr_13_0,wr_13_1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,363,4.0,1370,5.0,997,1.0,1024,2.0,1577,...,1,0,0,1,1,0,0,1,0,1
1,5.0,647,5.0,1313,3.0,3387,5.0,2969,1.0,4320,...,1,0,0,1,1,0,0,1,0,1
2,4.0,1623,1.0,1480,1.0,1021,4.0,3374,5.0,1333,...,0,1,0,1,1,0,0,1,0,1
3,3.0,504,3.0,2311,4.0,992,3.0,3245,1.0,357,...,1,0,0,1,1,0,0,1,0,1
4,1.0,927,1.0,707,5.0,556,2.0,1062,1.0,1014,...,1,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45527,2.0,1050,5.0,619,4.0,328,1.0,285,1.0,602,...,0,1,0,1,1,0,0,1,0,1
45528,2.0,581,3.0,1353,4.0,1164,1.0,798,3.0,1680,...,0,1,0,1,1,0,0,1,0,1
45529,4.0,593,1.0,857,1.0,1047,4.0,1515,5.0,1690,...,1,0,0,1,1,0,0,1,0,1
45530,1.0,747,3.0,1331,4.0,892,2.0,1281,1.0,1328,...,0,1,0,1,1,0,0,1,0,1


In [6]:
test = pd.get_dummies(test, columns=['age_group', 'education', 'engnat', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'])
test

Unnamed: 0_level_0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,...,wr_09_0,wr_09_1,wr_10_0,wr_10_1,wr_11_0,wr_11_1,wr_12_0,wr_12_1,wr_13_0,wr_13_1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,736,2.0,2941,3.0,4621,1.0,4857,2.0,2550,...,1,0,0,1,1,0,0,1,0,1
1,3.0,514,2.0,1952,3.0,1552,3.0,821,4.0,1150,...,1,0,1,0,1,0,1,0,1,0
2,3.0,500,2.0,2507,4.0,480,2.0,614,2.0,1326,...,1,0,0,1,1,0,0,1,0,1
3,1.0,669,1.0,1050,5.0,1435,2.0,2252,5.0,2533,...,0,1,0,1,0,1,0,1,0,1
4,2.0,499,1.0,1243,5.0,845,2.0,1666,2.0,925,...,1,0,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11378,5.0,427,5.0,1066,5.0,588,1.0,560,2.0,1110,...,1,0,0,1,1,0,0,1,0,1
11379,1.0,314,5.0,554,5.0,230,1.0,956,2.0,1173,...,0,1,0,1,0,1,0,1,0,1
11380,1.0,627,2.0,799,1.0,739,2.0,1123,1.0,829,...,1,0,0,1,1,0,0,1,0,1
11381,2.0,539,1.0,2090,2.0,4642,1.0,673,2.0,1185,...,1,0,0,1,0,1,0,1,1,0


In [None]:
train.shape, test.shape
train.columns.values
test.columns.values

# 3. 모델 학습

In [7]:
from pycaret.classification import *

In [8]:
import gc
gc.collect()

44

In [9]:
%%time
clf = setup(data = train, target = 'voted')

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,3711
1,Target Type,Binary
2,Label Encoded,"1: 0, 2: 1"
3,Original Data,"(45524, 202)"
4,Missing Values,False
5,Numeric Features,201
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 27 s


In [10]:
%%time
best_3 = compare_models(sort = 'AUC', n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.6918,0.7659,0.6554,0.7496,0.6992,0.3861,0.3899,19.9323
1,Gradient Boosting Classifier,0.6946,0.7654,0.6408,0.7628,0.6964,0.3938,0.4,24.148
2,Light Gradient Boosting Machine,0.6915,0.7632,0.6451,0.7551,0.6957,0.3867,0.3917,1.154
3,Linear Discriminant Analysis,0.6924,0.7623,0.6616,0.7471,0.7017,0.3867,0.3898,0.929
4,Extra Trees Classifier,0.6892,0.761,0.6453,0.7513,0.6942,0.3818,0.3865,3.6488
5,Ada Boost Classifier,0.6903,0.7571,0.6488,0.751,0.6961,0.3838,0.3881,5.8488
6,Extreme Gradient Boosting,0.6772,0.7479,0.6668,0.7218,0.6931,0.3537,0.3551,11.5542
7,Random Forest Classifier,0.6545,0.711,0.6035,0.7195,0.6564,0.3143,0.3192,0.4325
8,Decision Tree Classifier,0.6127,0.6096,0.6426,0.6467,0.6446,0.219,0.2191,1.8192
9,Logistic Regression,0.5465,0.5905,0.9936,0.547,0.7055,0.0006,0.0033,1.1526


Wall time: 23min 28s


In [None]:
best_3

In [11]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7032,0.7741,0.668,0.76,0.7111,0.4085,0.4121
1,0.6953,0.767,0.6416,0.7634,0.6972,0.3951,0.4012
2,0.6906,0.7669,0.633,0.761,0.6911,0.3863,0.3929
3,0.6951,0.7647,0.6508,0.7575,0.7001,0.3937,0.3984
4,0.6951,0.7683,0.6425,0.7626,0.6974,0.3947,0.4006
Mean,0.6959,0.7682,0.6472,0.7609,0.6994,0.3957,0.401
SD,0.0041,0.0032,0.0118,0.0021,0.0065,0.0072,0.0063


In [12]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.697,0.7657,0.6522,0.7598,0.7019,0.3975,0.4023


In [None]:
final_model = finalize_model(blended)

In [None]:
predictions = predict_model(final_model, data = test)

In [None]:
submission['voted'] = predictions['Score']

In [None]:
submission.to_csv('output/20200928-2.csv')

## train, valid 데이터

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('voted', axis = 1), train['voted'], test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## 하이퍼파라미터 튜닝

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, plot_importance

colsample_bytree': 0.9606396547156734
, 'max_bin': 164.7465545875555
, 'max_depth': 11.107229265820779
, 'min_child_samples': 198.98606348612637
, 'min_child_weight': 41.32364031573033
, 'num_leaves': 49.45519154750152
, 'reg_alpha': 0.8758415725587487
, 'reg_lambda': 8.299355787002883
, 'subsample': 0.6598579157665245}}

In [None]:
# parameter 별로 search할 범위를 설정. 
bayesian_params = {
    'max_depth': (6, 16), 
    'num_leaves': (10, 100), 
    'min_child_samples': (10, 500), 
    'min_child_weight':(1, 100),
    'subsample':(0.1, 1.0),
    'colsample_bytree': (0.1, 1.0),
    'max_bin':(10, 1000),
    'reg_lambda':(0.001, 10),
    'reg_alpha': (0.01, 50) 
}

In [None]:
def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree,max_bin, reg_lambda, reg_alpha):
    params = {
        #"n_estimators":500, "learning_rate":0.02,
        "n_estimators":1000, "learning_rate":0.001,
        'max_depth': int(round(max_depth)), #  호출 시 실수형 값이 들어오므로 정수형 하이퍼 파라미터는 정수형으로 변경 
        'num_leaves': int(round(num_leaves)), 
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0), 
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),
        'reg_lambda': max(reg_lambda,0),
        'reg_alpha': max(reg_alpha, 0)
    }
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric= 'auc', verbose= 100, 
                early_stopping_rounds= 100)
    valid_proba = lgb_model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, valid_proba)
    
    return roc_auc   

In [None]:
import gc
gc.collect()

In [None]:
%%time
# BayesianOptimization객체를 수행할 함수와 search할 parameter 범위를 설정하여 생성. 
lgbBO = BayesianOptimization(lgb_roc_eval,bayesian_params , random_state=42)
# 함수 반환값이 최대가 되는 입력값 유추를 위한 iteration 수행. 
lgbBO.maximize(init_points=10, n_iter=50)

In [None]:
target_list = []
for result in lgbBO.res:
    target = result['target']
    target_list.append(target)
print(target_list)
# 가장 큰 target 값을 가지는 순번(index)를 추출
print('maximum target index:', np.argmax(np.array(target_list)))

# 가장 큰 target값을 가지는 index값을 기준으로 res에서 해당 parameter 추출. 
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
print(max_dict)

* test_size=0.3
{'target': 0.7655615473679408, 'params': {'colsample_bytree': 0.737265320016441, 'max_bin': 30.378649352844423, 'max_depth': 15.699098521619943, 'min_child_samples': 417.8968939922066, 'min_child_weight': 22.02157195714934, 'num_leaves': 26.364247048639054, 'reg_alpha': 9.178391447573157, 'reg_lambda': 3.043118187352418, 'subsample': 0.5722807884690141}}

- outlier 삭제 
{'target': 0.7652209584768751, 'params': {'colsample_bytree': 0.5703824815639528, 'max_bin': 15.404309116854074, 'max_depth': 13.17919661428763, 'min_child_samples': 24.998278489695448, 'min_child_weight': 97.74061068693372, 'num_leaves': 99.35614131954165, 'reg_alpha': 1.2926370490809749, 'reg_lambda': 1.1059679925299122, 'subsample': 0.4274854420066033}}

- outlier 2.6 
{'target': 0.7655617202045278, 'params': {'colsample_bytree': 0.737265320016441, 'max_bin': 30.378649352844423, 'max_depth': 15.699098521619943, 'min_child_samples': 417.8968939922066, 'min_child_weight': 22.02157195714934, 'num_leaves': 26.364247048639054, 'reg_alpha': 9.178391447573157, 'reg_lambda': 3.043118187352418, 'subsample': 0.5722807884690141}}

In [None]:
clf = LGBMClassifier(
        n_jobs=-1
        ,nthread=4
        ,n_estimators=1000
        ,learning_rate=0.02
        ,max_depth = 16
        ,num_leaves=47
        ,colsample_bytree=0.56
        ,subsample=0.55
        ,max_bin=30
        ,reg_alpha=0.23
        ,reg_lambda=0.03
        ,min_child_weight=70
        ,min_child_samples=169
        ,silent=-1
        ,verbose=-1
        ,random_state=42
        )

clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], 
eval_metric= 'auc', verbose= 100, early_stopping_rounds= 50)

In [None]:
plot_importance(clf, figsize=(16, 32))

# 4. 예측

In [None]:
pred_y  = clf.predict(test)

In [None]:
submission['voted'] = pred_y 

In [None]:
submission

# 5. 제출

In [None]:
submission.to_csv('output/20200928-1.csv')