In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

np.random.seed(42)

from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from sklearn.metrics import confusion_matrix, accuracy_score

# 1. 데이터 로드

In [2]:
train=pd.read_csv('input/train.csv', index_col=0) #인덱스 col=0도 check!
test=pd.read_csv('input/test_x.csv', index_col=0)
submission=pd.read_csv('input/sample_submission.csv', index_col=0)  

In [None]:
train.shape, test.shape, submission.shape

In [None]:
train.head(3)
test.head(3)
submission.head(3)

# 2. EDA & 전처리

In [None]:
df = pd.concat([train, test])
df.shape

In [None]:
train.info

## Missing Value

In [None]:
train.isnull().sum().sum()
test.isnull().sum().sum()

## Target 분포

In [None]:
f, ax = plt.subplots(1, 2, figsize=(10, 4))

train['voted'].value_counts().plot.pie(autopct='%1.1f%%', ax=ax[0], shadow=True)
sns.countplot('voted', data=train, ax=ax[1])

plt.show()

# 균일

## Outlier

In [None]:
test.familysize.max(), test.familysize.mean(), train[train.familysize < 30]['familysize'].mean()

In [None]:
train[train.familysize > 20]['familysize'].value_counts().sort_values(ascending=False)

In [3]:
train.loc[train[train.familysize > 25].index, 'familysize'] = 2.6

In [None]:
train['familysize'].value_counts()

## 인코딩

In [None]:
#age_group, gender, race, religion
train['age_group'] = train['age_group'].str.replace('s','').astype('int')
train['gender']= pd.factorize(train['gender'])[0]
train['race']=pd.factorize(train['race'])[0]
train['religion']=pd.factorize(train['religion'])[0]

test['age_group'] = test['age_group'].str.replace('s','').astype('int')
test['gender']=pd.factorize(test['gender'])[0]
test['race']=pd.factorize(test['race'])[0]
test['religion']=pd.factorize(test['religion'])[0]

### One-hot

In [None]:
train = pd.get_dummies(train, columns=['age_group', 'education', 'engnat', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'])
train

In [None]:
test = pd.get_dummies(test, columns=['age_group', 'education', 'engnat', 'gender', 'hand',
       'married', 'race', 'religion', 'tp01', 'tp02', 'tp03', 'tp04', 'tp05',
       'tp06', 'tp07', 'tp08', 'tp09', 'tp10', 'urban', 'wf_01',
       'wf_02', 'wf_03', 'wr_01', 'wr_02', 'wr_03', 'wr_04', 'wr_05', 'wr_06',
       'wr_07', 'wr_08', 'wr_09', 'wr_10', 'wr_11', 'wr_12', 'wr_13'])
test

In [None]:
train.shape, test.shape
train.columns.values
test.columns.values

# 3. 모델 학습

In [4]:
from pycaret.classification import *

In [5]:
import gc
gc.collect()

24

In [6]:
%%time
clf = setup(data = train, target = 'voted', polynomial_features=True
           , categorical_features=['age_group','gender','race','religion'])

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,3931
1,Target Type,Binary
2,Label Encoded,"1: 0, 2: 1"
3,Original Data,"(45532, 77)"
4,Missing Values,False
5,Numeric Features,41
6,Categorical Features,35
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 34.5 s


In [7]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Gradient Boosting Classifier,0.6933,0.7626,0.639,0.7618,0.695,0.3914,0.3975,30.2455
1,Light Gradient Boosting Machine,0.6918,0.7614,0.6418,0.7575,0.6948,0.3878,0.3932,1.4747
2,CatBoost Classifier,0.6912,0.7618,0.6509,0.7511,0.6974,0.3854,0.3896,23.6733
3,Linear Discriminant Analysis,0.6892,0.759,0.6567,0.7447,0.6979,0.3805,0.3837,0.9534
4,Ridge Classifier,0.6891,0.0,0.6569,0.7445,0.6979,0.3803,0.3835,0.1794
5,Extra Trees Classifier,0.6883,0.7554,0.6424,0.7514,0.6926,0.3803,0.3851,3.6857
6,Ada Boost Classifier,0.6875,0.7548,0.649,0.7463,0.6942,0.3778,0.3817,7.3276
7,Extreme Gradient Boosting,0.6742,0.7437,0.662,0.7197,0.6896,0.348,0.3494,13.1998
8,Random Forest Classifier,0.6516,0.7069,0.6009,0.7163,0.6535,0.3085,0.3134,0.4655
9,Decision Tree Classifier,0.6113,0.6081,0.6421,0.6453,0.6436,0.216,0.2161,2.3857


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=3931, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [12]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors.KNeighborsClassifier,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model.SGDClassifier,True
rbfsvm,SVM - Radial Kernel,sklearn.svm.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process.GPC,False
mlp,MLP Classifier,sklearn.neural_network.MLPClassifier,False
ridge,Ridge Classifier,sklearn.linear_model.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble.RandomForestClassifier,True


In [10]:
gbc  = create_model('gbc')    

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6976,0.7649,0.6466,0.7641,0.7004,0.3994,0.4051
1,0.6948,0.7611,0.6351,0.7666,0.6947,0.3949,0.4019
2,0.6856,0.7571,0.6177,0.762,0.6823,0.3778,0.386
3,0.7016,0.7714,0.651,0.7678,0.7046,0.4072,0.4129
4,0.7113,0.7805,0.6546,0.782,0.7127,0.4272,0.434
5,0.6909,0.7545,0.6397,0.7575,0.6936,0.3863,0.3919
6,0.7057,0.7796,0.6586,0.77,0.71,0.4149,0.4201
7,0.6787,0.7452,0.6231,0.7474,0.6796,0.3627,0.3687
8,0.6843,0.7521,0.6363,0.7488,0.688,0.3728,0.3779
9,0.6828,0.7593,0.6271,0.7517,0.6838,0.3707,0.3769


In [13]:
tuned_gbc = tune_model(gbc)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6835,0.7554,0.6695,0.7294,0.6982,0.3668,0.3683
1,0.6807,0.7474,0.6701,0.725,0.6965,0.3607,0.362
2,0.6768,0.7387,0.659,0.7247,0.6903,0.354,0.3557
3,0.6856,0.7508,0.6699,0.7321,0.6996,0.3712,0.3728
4,0.6869,0.7582,0.6701,0.7341,0.7007,0.3738,0.3755
5,0.663,0.7343,0.6604,0.7048,0.6819,0.3244,0.3252
6,0.6916,0.7627,0.6862,0.7328,0.7087,0.3817,0.3827
7,0.6545,0.7234,0.638,0.7029,0.6689,0.3095,0.3111
8,0.6765,0.7507,0.6758,0.7165,0.6956,0.3511,0.3518
9,0.6621,0.7342,0.6489,0.7086,0.6774,0.324,0.3254


In [14]:
lightgbm  = create_model('lightgbm') 
tuned_lightgbm = tune_model(lightgbm)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6766,0.7538,0.654,0.727,0.6886,0.3542,0.3563
1,0.6735,0.7452,0.6569,0.721,0.6875,0.3472,0.3488
2,0.6734,0.7368,0.6533,0.7225,0.6862,0.3474,0.3493
3,0.6806,0.7572,0.6636,0.728,0.6943,0.3614,0.3631
4,0.684,0.7583,0.669,0.7306,0.6984,0.368,0.3695
5,0.6821,0.7502,0.6707,0.727,0.6977,0.3637,0.3651
6,0.6803,0.7565,0.6781,0.7207,0.6988,0.3588,0.3595
7,0.6586,0.7242,0.6443,0.7058,0.6737,0.3173,0.3188
8,0.6737,0.7467,0.6592,0.7204,0.6884,0.3473,0.3488
9,0.6643,0.7358,0.6374,0.7172,0.675,0.3303,0.3327


In [15]:
catboost  = create_model('catboost') 
tuned_catboost = tune_model(catboost)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6964,0.7674,0.6523,0.7585,0.7014,0.3961,0.4008
1,0.6907,0.7595,0.6472,0.7525,0.6959,0.3849,0.3894
2,0.6903,0.7594,0.6269,0.7642,0.6887,0.3865,0.394
3,0.6963,0.7713,0.6493,0.7601,0.7003,0.3963,0.4013
4,0.7085,0.7796,0.6592,0.7743,0.7121,0.4207,0.4263
5,0.6922,0.7546,0.6483,0.7543,0.6973,0.3878,0.3924
6,0.7016,0.7794,0.6655,0.7592,0.7093,0.4055,0.4092
7,0.6784,0.7451,0.6259,0.7452,0.6804,0.3616,0.3672
8,0.6787,0.7516,0.6397,0.7379,0.6853,0.3605,0.3643
9,0.6812,0.7542,0.6311,0.7468,0.6841,0.3669,0.3722


In [16]:
turn_3 = [tuned_gbc, tuned_lightgbm, tuned_catboost]

In [None]:
%%time
best_3 = compare_models(sort = 'AUC', n_select = 3)

In [17]:
#blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')
blended = blend_models(estimator_list = turn_3, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6905,0.7604,0.6517,0.7496,0.6973,0.3839,0.3878
1,0.6883,0.7574,0.6446,0.7503,0.6934,0.3801,0.3847
2,0.6905,0.7601,0.6551,0.7475,0.6983,0.3833,0.3869
3,0.6834,0.755,0.6545,0.737,0.6933,0.3685,0.3713
4,0.6801,0.7533,0.6483,0.7354,0.6891,0.3624,0.3654
Mean,0.6866,0.7572,0.6508,0.744,0.6943,0.3756,0.3792
SD,0.0041,0.0028,0.004,0.0064,0.0033,0.0087,0.0091


In [18]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.6953,0.7672,0.6561,0.7547,0.7019,0.3934,0.3975


In [19]:
final_model = finalize_model(blended)

In [None]:
final_model

In [None]:
predictions = predict_model(final_model, data = test)

In [None]:
submission['voted'] = predictions['Score']

In [None]:
submission.to_csv('output/20200929-1.csv')

## train, valid 데이터

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('voted', axis = 1), train['voted'], test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## 하이퍼파라미터 튜닝

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier, plot_importance

colsample_bytree': 0.9606396547156734
, 'max_bin': 164.7465545875555
, 'max_depth': 11.107229265820779
, 'min_child_samples': 198.98606348612637
, 'min_child_weight': 41.32364031573033
, 'num_leaves': 49.45519154750152
, 'reg_alpha': 0.8758415725587487
, 'reg_lambda': 8.299355787002883
, 'subsample': 0.6598579157665245}}

In [None]:
# parameter 별로 search할 범위를 설정. 
bayesian_params = {
    'max_depth': (6, 16), 
    'num_leaves': (10, 100), 
    'min_child_samples': (10, 500), 
    'min_child_weight':(1, 100),
    'subsample':(0.1, 1.0),
    'colsample_bytree': (0.1, 1.0),
    'max_bin':(10, 1000),
    'reg_lambda':(0.001, 10),
    'reg_alpha': (0.01, 50) 
}

In [None]:
def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree,max_bin, reg_lambda, reg_alpha):
    params = {
        #"n_estimators":500, "learning_rate":0.02,
        "n_estimators":1000, "learning_rate":0.001,
        'max_depth': int(round(max_depth)), #  호출 시 실수형 값이 들어오므로 정수형 하이퍼 파라미터는 정수형으로 변경 
        'num_leaves': int(round(num_leaves)), 
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0), 
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),
        'reg_lambda': max(reg_lambda,0),
        'reg_alpha': max(reg_alpha, 0)
    }
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric= 'auc', verbose= 100, 
                early_stopping_rounds= 100)
    valid_proba = lgb_model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, valid_proba)
    
    return roc_auc   

In [None]:
import gc
gc.collect()

In [None]:
%%time
# BayesianOptimization객체를 수행할 함수와 search할 parameter 범위를 설정하여 생성. 
lgbBO = BayesianOptimization(lgb_roc_eval,bayesian_params , random_state=42)
# 함수 반환값이 최대가 되는 입력값 유추를 위한 iteration 수행. 
lgbBO.maximize(init_points=10, n_iter=50)

In [None]:
target_list = []
for result in lgbBO.res:
    target = result['target']
    target_list.append(target)
print(target_list)
# 가장 큰 target 값을 가지는 순번(index)를 추출
print('maximum target index:', np.argmax(np.array(target_list)))

# 가장 큰 target값을 가지는 index값을 기준으로 res에서 해당 parameter 추출. 
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
print(max_dict)

* test_size=0.3
{'target': 0.7655615473679408, 'params': {'colsample_bytree': 0.737265320016441, 'max_bin': 30.378649352844423, 'max_depth': 15.699098521619943, 'min_child_samples': 417.8968939922066, 'min_child_weight': 22.02157195714934, 'num_leaves': 26.364247048639054, 'reg_alpha': 9.178391447573157, 'reg_lambda': 3.043118187352418, 'subsample': 0.5722807884690141}}

- outlier 삭제 
{'target': 0.7652209584768751, 'params': {'colsample_bytree': 0.5703824815639528, 'max_bin': 15.404309116854074, 'max_depth': 13.17919661428763, 'min_child_samples': 24.998278489695448, 'min_child_weight': 97.74061068693372, 'num_leaves': 99.35614131954165, 'reg_alpha': 1.2926370490809749, 'reg_lambda': 1.1059679925299122, 'subsample': 0.4274854420066033}}

- outlier 2.6 
{'target': 0.7655617202045278, 'params': {'colsample_bytree': 0.737265320016441, 'max_bin': 30.378649352844423, 'max_depth': 15.699098521619943, 'min_child_samples': 417.8968939922066, 'min_child_weight': 22.02157195714934, 'num_leaves': 26.364247048639054, 'reg_alpha': 9.178391447573157, 'reg_lambda': 3.043118187352418, 'subsample': 0.5722807884690141}}

In [None]:
clf = LGBMClassifier(
        n_jobs=-1
        ,nthread=4
        ,n_estimators=1000
        ,learning_rate=0.02
        ,max_depth = 16
        ,num_leaves=47
        ,colsample_bytree=0.56
        ,subsample=0.55
        ,max_bin=30
        ,reg_alpha=0.23
        ,reg_lambda=0.03
        ,min_child_weight=70
        ,min_child_samples=169
        ,silent=-1
        ,verbose=-1
        ,random_state=42
        )

clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], 
eval_metric= 'auc', verbose= 100, early_stopping_rounds= 50)

In [None]:
plot_importance(clf, figsize=(16, 32))

# 4. 예측

In [None]:
pred_y  = clf.predict(test)

In [None]:
submission['voted'] = pred_y 

In [None]:
submission

# 5. 제출

In [None]:
submission.to_csv('output/20200928-1.csv')