In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier




import os

In [2]:
# 데이터 불러오기
train = pd.read_csv('/kaggle/input/kakr-4th-competition/train.csv')
test = pd.read_csv('/kaggle/input/kakr-4th-competition/test.csv')
sample_submission = pd.read_csv('../input/kakr-4th-competition/sample_submission.csv')

label = train['income']

del train['income']

In [3]:
# 라벨 값 인코딩
label = label.map(lambda x: 1 if x == '>50K' else 0)

In [4]:
x_train, x_valid, y_train, y_valid = train_test_split(train, label, 
                                                          test_size=0.2,
                                                          random_state=20,
                                                          shuffle=True)

In [5]:
def preprocess(x_train, x_valid, x_test):
    
    global tmp_x_train
    global tmp_x_valid
    global tmp_x_test
    
    tmp_x_train = x_train.copy()
    tmp_x_valid = x_valid.copy()
    tmp_x_test  = x_test.copy()
    
    tmp_x_train = tmp_x_train.reset_index(drop=True)
    tmp_x_valid = tmp_x_valid.reset_index(drop=True)
    tmp_x_test  = tmp_x_test.reset_index(drop=True)
    
    # column 제거
    tmp_x_train.drop(['id','fnlwgt','education','relationship','native_country','workclass'], axis=1, inplace=True)
    tmp_x_valid.drop(['id','fnlwgt','education','relationship','native_country','workclass'], axis=1, inplace=True)
    tmp_x_test.drop(['id','fnlwgt','education','relationship','native_country','workclass'], axis=1, inplace=True)
    
    # marital status
    tmp_x_train['marital_status'] = (tmp_x_train['marital_status'] == 'Married-civ-spouse').astype(int)
    tmp_x_valid['marital_status'] = (tmp_x_valid['marital_status'] == 'Married-civ-spouse').astype(int)
    tmp_x_test['marital_status'] = (tmp_x_test['marital_status'] == 'Married-civ-spouse').astype(int)
    
    # race
    tmp_x_train['race'] = ((tmp_x_train['race'] == 'White') | (tmp_x_train['race'] == 'Asian-Pac-Islander')).astype(int)
    tmp_x_valid['race'] = ((tmp_x_valid['race'] == 'White') | (tmp_x_valid['race'] == 'Asian-Pac-Islander')).astype(int)
    tmp_x_test['race'] = ((tmp_x_test['race'] == 'White') | (tmp_x_test['race'] == 'Asian-Pac-Islander')).astype(int)
    
    # capital_gain, loss
    tmp_x_train['cap_gain_high'] = (tmp_x_train['capital_gain'] != 0).astype(int)
    tmp_x_train['cap_loss_high'] = (tmp_x_train['capital_loss'] >= 1700).astype(int)
    tmp_x_train['capital_gain'] = tmp_x_train['capital_gain'].map(lambda x : np.log(x) if x != 0 else 0)
    
    tmp_x_valid['cap_gain_high'] = (tmp_x_valid['capital_gain'] != 0).astype(int)
    tmp_x_valid['cap_loss_high'] = (tmp_x_valid['capital_loss'] >= 1700).astype(int)
    tmp_x_valid['capital_gain'] = tmp_x_valid['capital_gain'].map(lambda x : np.log(x) if x != 0 else 0)
    
    tmp_x_test['cap_gain_high'] = (tmp_x_test['capital_gain'] != 0).astype(int)
    tmp_x_test['cap_loss_high'] = (tmp_x_test['capital_loss'] >= 1700).astype(int)
    tmp_x_test['capital_gain'] = tmp_x_test['capital_gain'].map(lambda x : np.log(x) if x != 0 else 0)
    
    # age
    tmp_x_train.loc[tmp_x_train['age'] < 20, 'age_range'] = '~20'
    tmp_x_train.loc[tmp_x_train['age'] >= 65, 'age_range'] = '~65'
    down = 20
    for i in range(45//5):
        tmp_x_train.loc[(tmp_x_train['age'] >= down) & (tmp_x_train['age'] < down+5), 'age_range'] = str(down)+'~'+str(down+5)
        down += 5
    tmp_x_train['age'] = tmp_x_train['age_range']
    tmp_x_train.drop(['age_range'], axis=1, inplace=True)
    
    tmp_x_valid.loc[tmp_x_valid['age'] < 20, 'age_range'] = '~20'
    tmp_x_valid.loc[tmp_x_valid['age'] >= 65, 'age_range'] = '~65'
    down = 20
    for i in range(45//5):
        tmp_x_valid.loc[(tmp_x_valid['age'] >= down) & (tmp_x_valid['age'] < down+5), 'age_range'] = str(down)+'~'+str(down+5)
        down += 5
    tmp_x_valid['age'] = tmp_x_valid['age_range']
    tmp_x_valid.drop(['age_range'], axis=1, inplace=True)
    
    tmp_x_test.loc[tmp_x_test['age'] < 20, 'age_range'] = '~20'
    tmp_x_test.loc[tmp_x_test['age'] >= 65, 'age_range'] = '~65'
    down = 20
    for i in range(45//5):
        tmp_x_test.loc[(tmp_x_test['age'] >= down) & (tmp_x_test['age'] < down+5), 'age_range'] = str(down)+'~'+str(down+5)
        down += 5
    tmp_x_test['age'] = tmp_x_test['age_range']
    tmp_x_test.drop(['age_range'], axis=1, inplace=True)
        
    # edu_num
    tmp_x_train['edu_num_high'] = (tmp_x_train['education_num'] >= 13).astype(int)
    tmp_x_valid['edu_num_high'] = (tmp_x_valid['education_num'] >= 13).astype(int)
    tmp_x_test['edu_num_high'] = (tmp_x_test['education_num'] >= 13).astype(int)
    
    # hours-per-week
    tmp_x_train['hpw_high'] = (tmp_x_train['hours_per_week'] >= 50).astype(int)
    tmp_x_valid['hpw_high'] = (tmp_x_valid['hours_per_week'] >= 50).astype(int)
    tmp_x_test['hpw_high'] = (tmp_x_test['hours_per_week'] >= 50).astype(int)
    
    # min-max scaler
    mmscaler = MinMaxScaler()
    tmp_x_train['education_num'] = mmscaler.fit_transform(tmp_x_train['education_num'].values.reshape(-1,1))
    tmp_x_valid['education_num'] = mmscaler.transform(tmp_x_valid['education_num'].values.reshape(-1,1))
    tmp_x_test['education_num'] = mmscaler.transform(tmp_x_test['education_num'].values.reshape(-1,1))
    
    tmp_x_train['hours_per_week'] = mmscaler.transform(tmp_x_train['hours_per_week'].values.reshape(-1,1))
    tmp_x_valid['hours_per_week'] = mmscaler.transform(tmp_x_valid['hours_per_week'].values.reshape(-1,1))
    tmp_x_test['hours_per_week'] = mmscaler.transform(tmp_x_test['hours_per_week'].values.reshape(-1,1))

    
    # ohe
    tmp_all = pd.concat([tmp_x_train, tmp_x_valid, tmp_x_test])
    
    ohe = OneHotEncoder(sparse=False)
    cat_columns = ['age', 'marital_status', 'occupation', 'race', 'sex']
    ohe.fit(tmp_all[cat_columns])
    
    
    ohe_columns = list()
    for lst in ohe.categories_:
        ohe_columns += lst.tolist()
    
    tmp_train_cat = pd.DataFrame(ohe.transform(tmp_x_train[cat_columns]), columns=ohe_columns)
    tmp_valid_cat = pd.DataFrame(ohe.transform(tmp_x_valid[cat_columns]), columns=ohe_columns)
    tmp_test_cat  = pd.DataFrame(ohe.transform(tmp_x_test[cat_columns]), columns=ohe_columns)
    
    tmp_train_cat.columns = ohe.get_feature_names(cat_columns)
    tmp_valid_cat.columns = ohe.get_feature_names(cat_columns)
    tmp_test_cat.columns = ohe.get_feature_names(cat_columns)
    
    tmp_x_train = pd.concat([tmp_x_train, tmp_train_cat], axis=1)
    tmp_x_valid = pd.concat([tmp_x_valid, tmp_valid_cat], axis=1)
    tmp_x_test = pd.concat([tmp_x_test, tmp_test_cat], axis=1)

    tmp_x_train = tmp_x_train.drop(columns=cat_columns)
    tmp_x_valid = tmp_x_valid.drop(columns=cat_columns)
    tmp_x_test = tmp_x_test.drop(columns=cat_columns)
        
#     # get_dummies
#     tmp_x_train = pd.get_dummies(tmp_x_train, columns = ['age', 'marital_status', 'occupation', 'race', 'sex'])
#     tmp_x_valid = pd.get_dummies(tmp_x_valid, columns = ['age', 'marital_status', 'occupation', 'race', 'sex'])
#     tmp_x_test = pd.get_dummies(tmp_x_test, columns = ['age', 'marital_status', 'occupation', 'race', 'sex'])
    
    
    return tmp_x_train.values, tmp_x_valid.values, tmp_x_test.values

In [6]:
preprocess(x_train, x_valid, test)

(array([[0.66666667, 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        [0.53333333, 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        [0.53333333, 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        ...,
        [0.8       , 0.        , 0.        , ..., 1.        , 1.        ,
         0.        ],
        [0.8       , 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        [0.53333333, 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ]]),
 array([[0.53333333, 0.        , 0.        , ..., 1.        , 1.        ,
         0.        ],
        [0.86666667, 0.        , 0.        , ..., 0.        , 0.        ,
         1.        ],
        [0.6       , 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        ...,
        [0.53333333, 0.        , 0.        , ..., 1.        , 1.        ,
         0.        ],
        [0.8

# Simple LGBM

In [7]:
lgb = LGBMClassifier(tree_method='gpu_hist')

lgb.fit(tmp_x_train, y_train)

y_pred = lgb.predict(tmp_x_valid)

print(f"LightGBM F1 Score: {f1_score(y_valid, y_pred, average='micro')}")

LightGBM F1 Score: 0.8690978886756238


In [8]:
# light GBM
from sklearn.model_selection import StratifiedKFold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)


val_scores = list()
oof_pred = np.zeros((test.shape[0], )) # 이것도 달라짐

for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
    x_train, y_train = train.iloc[trn_idx, :], label[trn_idx]
    x_valid, y_valid = train.iloc[val_idx, :], label[val_idx]
    
    # 전처리
    x_train, x_valid, x_test = preprocess(x_train, x_valid, test)
    
    # 모델 정의
    clf = LGBMClassifier(tree_method='gpu_hist')
    
    # 모델 학습
    clf.fit(x_train, y_train,
            eval_set = [[x_valid, y_valid]], 
            eval_metric = 'logloss',        
            early_stopping_rounds = 100,
            verbose = 100,  )

    # 훈련, 검증 데이터 F1 Score 확인
    trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
    val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
    print('{} Fold, train f1_score : {:.4f}4, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))
    
    val_scores.append(val_f1_score)
    
    oof_pred += clf.predict_proba(x_test)[: , 1] / n_splits # 이게 달라진거임
    

# 교차 검증 F1 Score 평균 계산하기
print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.280645
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.280645
0 Fold, train f1_score : 0.87984, validation f1_score : 0.8695

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.28152
Did not meet early stopping. Best iteration is:
[99]	valid_0's binary_logloss: 0.281508
1 Fold, train f1_score : 0.88144, validation f1_score : 0.8670

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.285252
Did not meet early stopping. Best iteration is:
[95]	valid_0's binary_logloss: 0.285136
2 Fold, train f1_score : 0.88094, validation f1_score : 0.8651

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.283011
Did not meet early stopping. Best iteration is:
[98]	valid_0's binary_logloss: 0.282902
3 Fold, train f1_score : 0.88134, validation

## Hyper-parameter tuning

In [63]:
# 책에 나와있는 것

lgbm_wrapper = LGBMClassifier(n_estimators = 400)

evals = [(tmp_x_valid, y_valid)]
lgbm_wrapper.fit(tmp_x_valid, y_valid, early_stopping_rounds = 100, eval_metric = 'logloss',
                eval_set = evals, verbose = True)
preds = lgbm_wrapper.predict(tmp_x_valid)

[1]	training's binary_logloss: 0.50894
Training until validation scores don't improve for 100 rounds
[2]	training's binary_logloss: 0.476117
[3]	training's binary_logloss: 0.449627
[4]	training's binary_logloss: 0.427759
[5]	training's binary_logloss: 0.409123
[6]	training's binary_logloss: 0.393423
[7]	training's binary_logloss: 0.380252
[8]	training's binary_logloss: 0.368393
[9]	training's binary_logloss: 0.357875
[10]	training's binary_logloss: 0.348628
[11]	training's binary_logloss: 0.340411
[12]	training's binary_logloss: 0.332938
[13]	training's binary_logloss: 0.32669
[14]	training's binary_logloss: 0.32097
[15]	training's binary_logloss: 0.315906
[16]	training's binary_logloss: 0.31084
[17]	training's binary_logloss: 0.306664
[18]	training's binary_logloss: 0.302686
[19]	training's binary_logloss: 0.299266
[20]	training's binary_logloss: 0.295322
[21]	training's binary_logloss: 0.291826
[22]	training's binary_logloss: 0.289123
[23]	training's binary_logloss: 0.286385
[24]	tra

[325]	training's binary_logloss: 0.161267
[326]	training's binary_logloss: 0.16105
[327]	training's binary_logloss: 0.160928
[328]	training's binary_logloss: 0.160811
[329]	training's binary_logloss: 0.16064
[330]	training's binary_logloss: 0.160394
[331]	training's binary_logloss: 0.160311
[332]	training's binary_logloss: 0.160216
[333]	training's binary_logloss: 0.160066
[334]	training's binary_logloss: 0.159966
[335]	training's binary_logloss: 0.159817
[336]	training's binary_logloss: 0.159501
[337]	training's binary_logloss: 0.159335
[338]	training's binary_logloss: 0.159197
[339]	training's binary_logloss: 0.159009
[340]	training's binary_logloss: 0.158859
[341]	training's binary_logloss: 0.158739
[342]	training's binary_logloss: 0.158617
[343]	training's binary_logloss: 0.158478
[344]	training's binary_logloss: 0.158418
[345]	training's binary_logloss: 0.158162
[346]	training's binary_logloss: 0.157948
[347]	training's binary_logloss: 0.157778
[348]	training's binary_logloss: 0.1

In [80]:
preds2 = lgbm_wrapper.predict(x_test)

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler, Binarizer
from sklearn.linear_model import LogisticRegression


def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test, y_pred)
    
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))

In [82]:
get_clf_eval(y_valid, preds)

오차행렬:
 [[3836  112]
 [ 195 1066]]

정확도: 0.9411
정밀도: 0.9049
재현율: 0.8454
F1: 0.8741
AUC: 0.9085


In [86]:
sample_submission['prediction'] = preds2
sample_submission.to_csv('submission.csv', index=False)

# 0.84

# https://www.kaggle.com/mlisovyi/lightgbm-hyperparameter-optimisation-lb-0-761

### Set up HyperParameter search

In [164]:
def learning_rate_010_decay_power_099(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate  * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_010_decay_power_0995(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate  * np.power(.995, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_005_decay_power_099(current_iter):
    base_learning_rate = 0.05
    lr = base_learning_rate  * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

In [165]:
import lightgbm as lgb
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(tmp_x_valid, y_valid)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

In [166]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [197]:
#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100

import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    scoring='f1_macro',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

In [198]:
gs.fit(tmp_x_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Training until validation scores don't improve for 30 rounds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[100]	valid's auc: 0.917212
[200]	valid's auc: 0.91772
[300]	valid's auc: 0.918712
[400]	valid's auc: 0.919205
[500]	valid's auc: 0.919375
Early stopping, best iteration is:
[476]	valid's auc: 0.919436
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.917797
[200]	valid's auc: 0.918585
[300]	valid's auc: 0.919015
Early stopping, best iteration is:
[360]	valid's auc: 0.9191
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.916435
[200]	valid's auc: 0.917749
[300]	valid's auc: 0.918376
[400]	valid's auc: 0.918694
[500]	valid's auc: 0.918879
Early stopping, best iteration is:
[545]	valid's auc: 0.91899
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.916644
[200]	valid's auc: 0.917889
[300]	valid's auc: 0.918317
[400]	valid's auc: 0.918577
[500]	valid's auc: 0.918737
[600]	valid's auc: 0.918832
[700]	valid's auc: 0.918907
[800]	valid's auc: 0.918932
[900]	valid's auc: 0.918941
Early stopping, 

[200]	valid's auc: 0.917287
[300]	valid's auc: 0.917695
[400]	valid's auc: 0.918039
[500]	valid's auc: 0.918326
[600]	valid's auc: 0.918543
[700]	valid's auc: 0.918782
[800]	valid's auc: 0.918917
[900]	valid's auc: 0.919057
[1000]	valid's auc: 0.919179
[1100]	valid's auc: 0.919273
[1200]	valid's auc: 0.919355
[1300]	valid's auc: 0.919422
[1400]	valid's auc: 0.919487
[1500]	valid's auc: 0.919527
[1600]	valid's auc: 0.919579
[1700]	valid's auc: 0.919611
[1800]	valid's auc: 0.919644
[1900]	valid's auc: 0.919672
[2000]	valid's auc: 0.919698
[2100]	valid's auc: 0.919712
[2200]	valid's auc: 0.919728
Early stopping, best iteration is:
[2183]	valid's auc: 0.919732
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.915879
[200]	valid's auc: 0.916888
[300]	valid's auc: 0.917382
[400]	valid's auc: 0.917709
[500]	valid's auc: 0.917999
[600]	valid's auc: 0.918209
[700]	valid's auc: 0.918444
[800]	valid's auc: 0.918581
[900]	valid's auc: 0.918723
[1000]	valid's auc: 0.

[100]	valid's auc: 0.919075
[200]	valid's auc: 0.92084
[300]	valid's auc: 0.921964
Early stopping, best iteration is:
[368]	valid's auc: 0.922433
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.919663
[200]	valid's auc: 0.921016
[300]	valid's auc: 0.922134
[400]	valid's auc: 0.922825
[500]	valid's auc: 0.923115
Early stopping, best iteration is:
[552]	valid's auc: 0.923201
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.919782
[200]	valid's auc: 0.922051
[300]	valid's auc: 0.923395
[400]	valid's auc: 0.924271
[500]	valid's auc: 0.924517
Early stopping, best iteration is:
[490]	valid's auc: 0.924563
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.90788
Early stopping, best iteration is:
[134]	valid's auc: 0.907976
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.908083
Early stopping, best iteration is:
[157]	valid's auc: 0.908185
Training until validati

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.914846
[200]	valid's auc: 0.916563
[300]	valid's auc: 0.916844
[400]	valid's auc: 0.917151
[500]	valid's auc: 0.91746
[600]	valid's auc: 0.917661
[700]	valid's auc: 0.917892
[800]	valid's auc: 0.918019
[900]	valid's auc: 0.918178
[1000]	valid's auc: 0.918292
[1100]	valid's auc: 0.918406
[1200]	valid's auc: 0.91849
[1300]	valid's auc: 0.918574
[1400]	valid's auc: 0.918666
[1500]	valid's auc: 0.918725
[1600]	valid's auc: 0.918794
[1700]	valid's auc: 0.918843
[1800]	valid's auc: 0.918923
[1900]	valid's auc: 0.918961
[2000]	valid's auc: 0.919024
[2100]	valid's auc: 0.919061
[2200]	valid's auc: 0.919107
[2300]	valid's auc: 0.919144
[2400]	valid's auc: 0.919186
[2500]	valid's auc: 0.919219
[2600]	valid's auc: 0.919255
[2700]	valid's auc: 0.919275
[2800]	valid's auc: 0.919309
[2900]	valid's auc: 0.919338
[3000]	valid's auc: 0.919365
[3100]	valid's auc: 0.919396
[3200]	valid's auc: 0.919417
[3300]	valid's auc: 0.

[1200]	valid's auc: 0.919247
[1300]	valid's auc: 0.919305
[1400]	valid's auc: 0.919364
[1500]	valid's auc: 0.919404
[1600]	valid's auc: 0.919412
Early stopping, best iteration is:
[1618]	valid's auc: 0.919412
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.91924
[200]	valid's auc: 0.921207
[300]	valid's auc: 0.921745
Early stopping, best iteration is:
[272]	valid's auc: 0.921874
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.920048
[200]	valid's auc: 0.921736
[300]	valid's auc: 0.921693
Early stopping, best iteration is:
[270]	valid's auc: 0.922009
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.920084
[200]	valid's auc: 0.922301
[300]	valid's auc: 0.922885
[400]	valid's auc: 0.923094
Early stopping, best iteration is:
[422]	valid's auc: 0.923219
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	valid's auc: 0.5
Training until validati

[2400]	valid's auc: 0.91921
[2500]	valid's auc: 0.91922
[2600]	valid's auc: 0.919226
[2700]	valid's auc: 0.919233
[2800]	valid's auc: 0.919237
[2900]	valid's auc: 0.919244
[3000]	valid's auc: 0.919251
[3100]	valid's auc: 0.919258
[3200]	valid's auc: 0.91926
Early stopping, best iteration is:
[3216]	valid's auc: 0.919261
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.915367
[200]	valid's auc: 0.915768
[300]	valid's auc: 0.916031
[400]	valid's auc: 0.916208
[500]	valid's auc: 0.916317
[600]	valid's auc: 0.916405
[700]	valid's auc: 0.916481
[800]	valid's auc: 0.916542
[900]	valid's auc: 0.916595
[1000]	valid's auc: 0.916638
[1100]	valid's auc: 0.916677
[1200]	valid's auc: 0.916708
[1300]	valid's auc: 0.916735
[1400]	valid's auc: 0.91676
[1500]	valid's auc: 0.916776
[1600]	valid's auc: 0.916793
[1700]	valid's auc: 0.916808
[1800]	valid's auc: 0.916829
[1900]	valid's auc: 0.916846
[2000]	valid's auc: 0.916859
[2100]	valid's auc: 0.916871
[2200]	valid's auc

[1300]	valid's auc: 0.919633
[1400]	valid's auc: 0.919682
[1500]	valid's auc: 0.919715
[1600]	valid's auc: 0.919751
[1700]	valid's auc: 0.919782
[1800]	valid's auc: 0.91982
[1900]	valid's auc: 0.919845
[2000]	valid's auc: 0.919867
[2100]	valid's auc: 0.919884
[2200]	valid's auc: 0.919909
[2300]	valid's auc: 0.919929
[2400]	valid's auc: 0.919951
[2500]	valid's auc: 0.919967
[2600]	valid's auc: 0.919984
[2700]	valid's auc: 0.919996
[2800]	valid's auc: 0.920008
[2900]	valid's auc: 0.920019
[3000]	valid's auc: 0.92003
[3100]	valid's auc: 0.920042
[3200]	valid's auc: 0.920049
Early stopping, best iteration is:
[3221]	valid's auc: 0.920053
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.915316
[200]	valid's auc: 0.917732
[300]	valid's auc: 0.918446
[400]	valid's auc: 0.918885
[500]	valid's auc: 0.919095
[600]	valid's auc: 0.919347
[700]	valid's auc: 0.919589
[800]	valid's auc: 0.919709
[900]	valid's auc: 0.919821
[1000]	valid's auc: 0.919886
[1100]	valid's a

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.916077
[200]	valid's auc: 0.917209
[300]	valid's auc: 0.917786
[400]	valid's auc: 0.918113
[500]	valid's auc: 0.918381
[600]	valid's auc: 0.9186
[700]	valid's auc: 0.918802
[800]	valid's auc: 0.918946
[900]	valid's auc: 0.919045
[1000]	valid's auc: 0.919125
[1100]	valid's auc: 0.919199
[1200]	valid's auc: 0.919268
[1300]	valid's auc: 0.919312
[1400]	valid's auc: 0.919351
[1500]	valid's auc: 0.919393
[1600]	valid's auc: 0.91945
[1700]	valid's auc: 0.9195
[1800]	valid's auc: 0.919552
[1900]	valid's auc: 0.919594
[2000]	valid's auc: 0.919657
[2100]	valid's auc: 0.91971
[2200]	valid's auc: 0.919761
Early stopping, best iteration is:
[2222]	valid's auc: 0.919762
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.916976
[200]	valid's auc: 0.91789
[300]	valid's auc: 0.918453
[400]	valid's auc: 0.918758
[500]	valid's auc: 0.919074
[600]	valid's auc: 0.919312
[700]	valid's auc: 0.919

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.915475
[200]	valid's auc: 0.916974
[300]	valid's auc: 0.917631
[400]	valid's auc: 0.918138
[500]	valid's auc: 0.918402
[600]	valid's auc: 0.918632
[700]	valid's auc: 0.918888
[800]	valid's auc: 0.919086
[900]	valid's auc: 0.919253
[1000]	valid's auc: 0.919345
[1100]	valid's auc: 0.919452
[1200]	valid's auc: 0.919551
[1300]	valid's auc: 0.919666
[1400]	valid's auc: 0.919764
[1500]	valid's auc: 0.919837
[1600]	valid's auc: 0.919901
[1700]	valid's auc: 0.919978
[1800]	valid's auc: 0.920052
[1900]	valid's auc: 0.920096
[2000]	valid's auc: 0.920168
[2100]	valid's auc: 0.920225
[2200]	valid's auc: 0.920277
[2300]	valid's auc: 0.920318
[2400]	valid's auc: 0.920379
[2500]	valid's auc: 0.920421
[2600]	valid's auc: 0.920465
[2700]	valid's auc: 0.920501
[2800]	valid's auc: 0.920543
[2900]	valid's auc: 0.92058
[3000]	valid's auc: 0.920634
[3100]	valid's auc: 0.920657
[3200]	valid's auc: 0.920699
Early stopping, best 

[500]	valid's auc: 0.915422
[600]	valid's auc: 0.915494
Early stopping, best iteration is:
[595]	valid's auc: 0.915531
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.914307
[200]	valid's auc: 0.914589
[300]	valid's auc: 0.914793
[400]	valid's auc: 0.914969
[500]	valid's auc: 0.915093
[600]	valid's auc: 0.915184
[700]	valid's auc: 0.915266
[800]	valid's auc: 0.915333
Early stopping, best iteration is:
[869]	valid's auc: 0.915388
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.914522
[200]	valid's auc: 0.915014
[300]	valid's auc: 0.915335
[400]	valid's auc: 0.915532
[500]	valid's auc: 0.915709
[600]	valid's auc: 0.915825
[700]	valid's auc: 0.916016
Early stopping, best iteration is:
[765]	valid's auc: 0.916074
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.911153
Early stopping, best iteration is:
[161]	valid's auc: 0.911453
Training until validation scores don't improve for 30 rounds


[300]	valid's auc: 0.91783
[400]	valid's auc: 0.918226
[500]	valid's auc: 0.918886
Early stopping, best iteration is:
[551]	valid's auc: 0.919115
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.910952
[200]	valid's auc: 0.912135
Early stopping, best iteration is:
[227]	valid's auc: 0.912262
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.911195
[200]	valid's auc: 0.912001
[300]	valid's auc: 0.912432
Early stopping, best iteration is:
[350]	valid's auc: 0.91259
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.910407
[200]	valid's auc: 0.911332
Early stopping, best iteration is:
[265]	valid's auc: 0.9115
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.916163
[200]	valid's auc: 0.91734
[300]	valid's auc: 0.917613
[400]	valid's auc: 0.917688
[500]	valid's auc: 0.917761
[600]	valid's auc: 0.91781
[700]	valid's auc: 0.917865
[800]	valid's auc: 0.9179
[900]	va

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  5.0min finished


Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.922665
[200]	valid's auc: 0.925268
[300]	valid's auc: 0.926173
[400]	valid's auc: 0.927112
[500]	valid's auc: 0.927544
Early stopping, best iteration is:
[550]	valid's auc: 0.927737
Best score reached: 0.8064773837609197 with params: {'colsample_bytree': 0.7173613575636579, 'min_child_samples': 134, 'min_child_weight': 1e-05, 'num_leaves': 14, 'reg_alpha': 0, 'reg_lambda': 20, 'subsample': 0.5813098915332424} 


In [191]:
opt_parameters = {'colsample_bytree': 0.7173613575636579, 
                  'min_child_samples': 134, 
                  'min_child_weight': 1e-05, 
                  'num_leaves': 14, 
                  'reg_alpha': 0, 
                  'reg_lambda': 20, 
                  'subsample': 0.5813098915332424} 

#{'colsample_bytree': 0.9234, 'min_child_samples': 399, 'min_child_weight': 0.1, 'num_leaves': 13, 'reg_alpha': 2, 'reg_lambda': 5, 'subsample': 0.855}

### Tune the weights of unbalanced classes

In [169]:
clf_sw = lgb.LGBMClassifier(**clf.get_params())
#set optimal parameters
clf_sw.set_params(**opt_parameters)

LGBMClassifier(colsample_bytree=0.7173613575636579, metric='None',
               min_child_samples=134, min_child_weight=1e-05, n_estimators=5000,
               n_jobs=4, num_leaves=14, random_state=314, reg_alpha=0,
               reg_lambda=20, subsample=0.5813098915332424)

In [182]:
gs_sample_weight = GridSearchCV(estimator=clf_sw, 
                                param_grid={'scale_pos_weight':[1, 2, 6, 12]},
                                scoring='f1',
                                cv=5,
                                refit=True,
                                verbose=True)

In [171]:
gs_sample_weight.fit(tmp_x_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs_sample_weight.best_score_, gs_sample_weight.best_params_))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Training until validation scores don't improve for 30 rounds


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[100]	valid's auc: 0.922348
[200]	valid's auc: 0.924951
[300]	valid's auc: 0.925594
[400]	valid's auc: 0.926196
[500]	valid's auc: 0.926426
Early stopping, best iteration is:
[517]	valid's auc: 0.926453
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.922415
[200]	valid's auc: 0.924855
[300]	valid's auc: 0.925498
[400]	valid's auc: 0.925857
[500]	valid's auc: 0.926217
[600]	valid's auc: 0.926336
Early stopping, best iteration is:
[588]	valid's auc: 0.9264
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.921934
[200]	valid's auc: 0.924281
[300]	valid's auc: 0.925527
Early stopping, best iteration is:
[364]	valid's auc: 0.925908
Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.922598
[200]	valid's auc: 0.924977
[300]	valid's auc: 0.926078
[400]	valid's auc: 0.926579
Early stopping, best iteration is:
[413]	valid's auc: 0.926631
Training until validation scores don't improve for 30 rounds
[1

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   18.6s finished


[100]	valid's auc: 0.923868
[200]	valid's auc: 0.926333
[300]	valid's auc: 0.927187
Early stopping, best iteration is:
[367]	valid's auc: 0.927437
Best score reached: 0.7211768617287371 with params: {'scale_pos_weight': 2} 


### Build final model

In [199]:
#Configure from the HP optimisation
clf_final = lgb.LGBMClassifier(**gs.best_estimator_.get_params())

#Configure locally from hardcoded values
#clf_final = lgb.LGBMClassifier(**clf.get_params())

#set optimal parameters
clf_final.set_params(**opt_parameters)

#Train the final model with learning rate decay
clf_final.fit(tmp_x_train, y_train, **fit_params, callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)])

Training until validation scores don't improve for 30 rounds
[100]	valid's auc: 0.921265
[200]	valid's auc: 0.923487
[300]	valid's auc: 0.924268
[400]	valid's auc: 0.92454
[500]	valid's auc: 0.924644
[600]	valid's auc: 0.924709
[700]	valid's auc: 0.924761
Early stopping, best iteration is:
[752]	valid's auc: 0.924772


LGBMClassifier(colsample_bytree=0.7173613575636579, metric='None',
               min_child_samples=134, min_child_weight=1e-05, n_estimators=5000,
               n_jobs=4, num_leaves=14, random_state=314, reg_alpha=0,
               reg_lambda=20, subsample=0.5813098915332424)

In [200]:
qq = clf_final.predict(tmp_x_valid)
print(f"LightGBM F1 Score: {f1_score(y_valid, qq, average='micro')}")
#0.86

LightGBM F1 Score: 0.8661931272797082


In [130]:
preds3 = clf_final.predict(x_test)
sample_submission['prediction'] = preds3
sample_submission.to_csv('submission_pred3.csv', index=False)

# 0.86

# 베이지안 파라미터 옵티마이제이션

In [135]:
#building models
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import time
import sys

#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 

#metrics 
from sklearn.metrics import roc_auc_score, roc_curve
import shap

In [136]:
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

In [138]:
def bayes_parameter_opt_lgb(X, y, init_round=20, opt_round=30, n_folds=5, random_seed=6, n_estimators=10000,
                            learning_rate=0.05, output_process=False):
    # prepare data

    train_data = lgb.Dataset(data=X, label=y)
    # parameters

    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, 
                 lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        
        global cv_result

        params = {'objective':'binary','num_iterations':1000, 'learning_rate':0.05,
                  'early_stopping_round':100, 'metric':'binary_logloss'} #rmse
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        
        cv_result = lgb.cv(params, train_data, nfold=3, seed=random_seed,
                           stratified=False, verbose_eval =200, metrics=['binary_logloss']) #rmse

        return min(cv_result['binary_logloss-mean']) 

    # setting range of the parameters
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.5, 1),
                                            'max_depth': (5, 8.99),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (5, 60)}, random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    # output optimization process
    if output_process==True: lgbBO.points_to_csv("bayes_opt_result.csv")
    
    # return
    return lgbBO

opt_params = bayes_parameter_opt_lgb(tmp_x_train, y_train, init_round=5, opt_round=10, n_folds=3,
                                     random_seed=6, n_estimators=10000, learning_rate=0.05)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
[200]	cv_agg's binary_logloss: 0.305371 + 0.00716862
[400]	cv_agg's binary_logloss: 0.303903 + 0.00741055
[600]	cv_agg's binary_logloss: 0.303098 + 0.00741458
[800]	cv_agg's binary_logloss: 0.302444 + 0.00740958
[1000]	cv_agg's binary_logloss: 0.301924 + 0.00739532
| [0m 1       [0m | [0m 0.3019  [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 6.69    [0m | [0m 40.52   [0m | [0m 0.04432 [0m | [0m 42.73   [0m |
[200]	cv_agg's binary_logloss: 0.310054 + 0.00746455
[400]	cv_agg's binary_logloss: 0.308554 + 0.00785947
[600]	cv_agg's binary_logloss: 0.30809 + 0.00787684
[800]	cv_agg's binary_logloss: 0.307634 + 0.00787019
[1000]	cv_agg's binary_logloss: 0.30726 + 0.00786974
| [95m 2       [0m | 

In [139]:
params = opt_params.max['params']

In [140]:
params

{'bagging_fraction': 0.5,
 'feature_fraction': 0.1,
 'lambda_l1': 5.0,
 'lambda_l2': 3.0,
 'max_depth': 5.0,
 'min_child_weight': 60.0,
 'min_split_gain': 0.1,
 'num_leaves': 24.506343212578575}

In [141]:
params = {
    "objective" : "binary",
    "metric" : "binary_logloss", # rmse
    "bagging_frequency" : 5,
    "bagging_seed" : 2018,
    "verbosity" : -1,

    # Selected rounded-off params
    'bagging_fraction': 0.5,
    'feature_fraction': 0.1,
    'lambda_l1': 5.0,
    'lambda_l2': 3.0,
    'max_depth': 5,
    'min_child_weight': 60.0,
    'min_split_gain': 0.1,
    'num_leaves': 25
}

In [26]:
params = {
    "objective" : "binary",
    "metric" : "binary_logloss",
    "bagging_frequency" : 5,
    "bagging_seed" : 2018,
    "verbosity" : -1,

    # Selected rounded-off params
    'bagging_fraction': 0.5,
    'feature_fraction': 0.1,
    'lambda_l1': 5.0,
    'lambda_l2': 3.0,
    'max_depth': 5,
    'min_child_weight': 60.0,
    'min_split_gain': 0.1,
    'num_leaves': 45
}

In [None]:
#train_test_split 
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=0)
#converting the dataset into proper LGB format 
d_train=lgb.Dataset(X_train, label=y_train)
#Specifying the parameter
params={}
params['learning_rate']=0.03
params['boosting_type']='gbdt' #GradientBoostingDecisionTree
params['objective']='binary' #Binary target feature
params['metric']='binary_logloss' #metric for binary classification
params['max_depth']=10
#train the model 
clf=lgb.train(params,d_train,100) #train the model on 100 epocs
#prediction on the test set
y_pred=clf.predict(X_test)

In [147]:
import lightgbm as lgb
d_train = lgb.Dataset(tmp_x_train, label=y_train)
d_test = lgb.Dataset(tmp_x_valid, label=y_valid)
clf = lgb.train(params, d_train, 1000, d_test, verbose_eval=100, early_stopping_rounds=100)

y_pred = clf.predict(x_valid)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.322877
[200]	valid_0's binary_logloss: 0.305375
[300]	valid_0's binary_logloss: 0.301182
[400]	valid_0's binary_logloss: 0.300576
[500]	valid_0's binary_logloss: 0.300269
[600]	valid_0's binary_logloss: 0.300054
[700]	valid_0's binary_logloss: 0.299816
[800]	valid_0's binary_logloss: 0.299639
[900]	valid_0's binary_logloss: 0.299427
[1000]	valid_0's binary_logloss: 0.299261
Did not meet early stopping. Best iteration is:
[997]	valid_0's binary_logloss: 0.299261


In [148]:
#rounding the values
y_pred=y_pred.round(0)
#converting from float to integer
y_pred=y_pred.astype(int)
f1 = f1_score(y_valid, y_pred, average='micro')
print('-F1 Score: ', f1)

-F1 Score:  0.8608178153196391


In [153]:
y_pred3 = clf.predict(x_test)
#rounding the values
y_pred3=y_pred3.round(0)
#converting from float to integer
y_pred3=y_pred3.astype(int)

In [155]:
sample_submission['prediction'] = y_pred3
sample_submission.to_csv('submission_pred4.csv', index=False)

# 이거 아직 제출 못해봄