In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier




import os

In [2]:
# 데이터 불러오기
train = pd.read_csv('/kaggle/input/kakr-4th-competition/train.csv')
test = pd.read_csv('/kaggle/input/kakr-4th-competition/test.csv')
sample_submission = pd.read_csv('../input/kakr-4th-competition/sample_submission.csv')

label = train['income']

del train['income']

In [3]:
# 라벨 값 인코딩
label = label.map(lambda x: 1 if x == '>50K' else 0)

In [4]:
x_train, x_valid, y_train, y_valid = train_test_split(train, label, 
                                                          test_size=0.2,
                                                          random_state=20,
                                                          shuffle=True)

In [14]:
def preprocess(x_train, x_valid, x_test):
    
    global tmp_x_train
    global tmp_x_valid
    global tmp_x_test
    
    tmp_x_train = x_train.copy()
    tmp_x_valid = x_valid.copy()
    tmp_x_test  = x_test.copy()
    
    tmp_x_train = tmp_x_train.reset_index(drop=True)
    tmp_x_valid = tmp_x_valid.reset_index(drop=True)
    tmp_x_test  = tmp_x_test.reset_index(drop=True)
    
    # column 제거
    tmp_x_train.drop(['id','fnlwgt','education','relationship','native_country','workclass'], axis=1, inplace=True)
    tmp_x_valid.drop(['id','fnlwgt','education','relationship','native_country','workclass'], axis=1, inplace=True)
    tmp_x_test.drop(['id','fnlwgt','education','relationship','native_country','workclass'], axis=1, inplace=True)
    
    # marital status
    tmp_x_train['marital_status'] = (tmp_x_train['marital_status'] == 'Married-civ-spouse').astype(int)
    tmp_x_valid['marital_status'] = (tmp_x_valid['marital_status'] == 'Married-civ-spouse').astype(int)
    tmp_x_test['marital_status'] = (tmp_x_test['marital_status'] == 'Married-civ-spouse').astype(int)
    
    # race
    tmp_x_train['race'] = ((tmp_x_train['race'] == 'White') | (tmp_x_train['race'] == 'Asian-Pac-Islander')).astype(int)
    tmp_x_valid['race'] = ((tmp_x_valid['race'] == 'White') | (tmp_x_valid['race'] == 'Asian-Pac-Islander')).astype(int)
    tmp_x_test['race'] = ((tmp_x_test['race'] == 'White') | (tmp_x_test['race'] == 'Asian-Pac-Islander')).astype(int)
    
    # capital_gain, loss
    tmp_x_train['cap_gain_high'] = (tmp_x_train['capital_gain'] != 0).astype(int)
    tmp_x_train['cap_loss_high'] = (tmp_x_train['capital_loss'] >= 1700).astype(int)
    tmp_x_train['capital_gain'] = tmp_x_train['capital_gain'].map(lambda x : np.log(x) if x != 0 else 0)
    
    tmp_x_valid['cap_gain_high'] = (tmp_x_valid['capital_gain'] != 0).astype(int)
    tmp_x_valid['cap_loss_high'] = (tmp_x_valid['capital_loss'] >= 1700).astype(int)
    tmp_x_valid['capital_gain'] = tmp_x_valid['capital_gain'].map(lambda x : np.log(x) if x != 0 else 0)
    
    tmp_x_test['cap_gain_high'] = (tmp_x_test['capital_gain'] != 0).astype(int)
    tmp_x_test['cap_loss_high'] = (tmp_x_test['capital_loss'] >= 1700).astype(int)
    tmp_x_test['capital_gain'] = tmp_x_test['capital_gain'].map(lambda x : np.log(x) if x != 0 else 0)
    
    # age
    tmp_x_train.loc[tmp_x_train['age'] < 20, 'age_range'] = '~20'
    tmp_x_train.loc[tmp_x_train['age'] >= 65, 'age_range'] = '~65'
    down = 20
    for i in range(45//5):
        tmp_x_train.loc[(tmp_x_train['age'] >= down) & (tmp_x_train['age'] < down+5), 'age_range'] = str(down)+'~'+str(down+5)
        down += 5
    tmp_x_train['age'] = tmp_x_train['age_range']
    tmp_x_train.drop(['age_range'], axis=1, inplace=True)
    
    tmp_x_valid.loc[tmp_x_valid['age'] < 20, 'age_range'] = '~20'
    tmp_x_valid.loc[tmp_x_valid['age'] >= 65, 'age_range'] = '~65'
    down = 20
    for i in range(45//5):
        tmp_x_valid.loc[(tmp_x_valid['age'] >= down) & (tmp_x_valid['age'] < down+5), 'age_range'] = str(down)+'~'+str(down+5)
        down += 5
    tmp_x_valid['age'] = tmp_x_valid['age_range']
    tmp_x_valid.drop(['age_range'], axis=1, inplace=True)
    
    tmp_x_test.loc[tmp_x_test['age'] < 20, 'age_range'] = '~20'
    tmp_x_test.loc[tmp_x_test['age'] >= 65, 'age_range'] = '~65'
    down = 20
    for i in range(45//5):
        tmp_x_test.loc[(tmp_x_test['age'] >= down) & (tmp_x_test['age'] < down+5), 'age_range'] = str(down)+'~'+str(down+5)
        down += 5
    tmp_x_test['age'] = tmp_x_test['age_range']
    tmp_x_test.drop(['age_range'], axis=1, inplace=True)
        
    # edu_num
    tmp_x_train['edu_num_high'] = (tmp_x_train['education_num'] >= 13).astype(int)
    tmp_x_valid['edu_num_high'] = (tmp_x_valid['education_num'] >= 13).astype(int)
    tmp_x_test['edu_num_high'] = (tmp_x_test['education_num'] >= 13).astype(int)
    
    # hours-per-week
    tmp_x_train['hpw_high'] = (tmp_x_train['hours_per_week'] >= 50).astype(int)
    tmp_x_valid['hpw_high'] = (tmp_x_valid['hours_per_week'] >= 50).astype(int)
    tmp_x_test['hpw_high'] = (tmp_x_test['hours_per_week'] >= 50).astype(int)
    
    # min-max scaler
    mmscaler = MinMaxScaler()
    tmp_x_train['education_num'] = mmscaler.fit_transform(tmp_x_train['education_num'].values.reshape(-1,1))
    tmp_x_valid['education_num'] = mmscaler.transform(tmp_x_valid['education_num'].values.reshape(-1,1))
    tmp_x_test['education_num'] = mmscaler.transform(tmp_x_test['education_num'].values.reshape(-1,1))
    
    tmp_x_train['hours_per_week'] = mmscaler.fit_transform(tmp_x_train['hours_per_week'].values.reshape(-1,1))
    tmp_x_valid['hours_per_week'] = mmscaler.transform(tmp_x_valid['hours_per_week'].values.reshape(-1,1))
    tmp_x_test['hours_per_week'] = mmscaler.transform(tmp_x_test['hours_per_week'].values.reshape(-1,1))

    
    # ohe
    tmp_all = pd.concat([tmp_x_train, tmp_x_valid, tmp_x_test])
    
    ohe = OneHotEncoder(sparse=False)
    cat_columns = ['age', 'marital_status', 'occupation', 'race', 'sex']
    ohe.fit(tmp_all[cat_columns])
    
    
    ohe_columns = list()
    for lst in ohe.categories_:
        ohe_columns += lst.tolist()
    
    tmp_train_cat = pd.DataFrame(ohe.transform(tmp_x_train[cat_columns]), columns=ohe_columns)
    tmp_valid_cat = pd.DataFrame(ohe.transform(tmp_x_valid[cat_columns]), columns=ohe_columns)
    tmp_test_cat  = pd.DataFrame(ohe.transform(tmp_x_test[cat_columns]), columns=ohe_columns)
    
    tmp_train_cat.columns = ohe.get_feature_names(cat_columns)
    tmp_valid_cat.columns = ohe.get_feature_names(cat_columns)
    tmp_test_cat.columns = ohe.get_feature_names(cat_columns)
    
    tmp_x_train = pd.concat([tmp_x_train, tmp_train_cat], axis=1)
    tmp_x_valid = pd.concat([tmp_x_valid, tmp_valid_cat], axis=1)
    tmp_x_test = pd.concat([tmp_x_test, tmp_test_cat], axis=1)

    tmp_x_train = tmp_x_train.drop(columns=cat_columns)
    tmp_x_valid = tmp_x_valid.drop(columns=cat_columns)
    tmp_x_test = tmp_x_test.drop(columns=cat_columns)
        
#     # get_dummies
#     tmp_x_train = pd.get_dummies(tmp_x_train, columns = ['age', 'marital_status', 'occupation', 'race', 'sex'])
#     tmp_x_valid = pd.get_dummies(tmp_x_valid, columns = ['age', 'marital_status', 'occupation', 'race', 'sex'])
#     tmp_x_test = pd.get_dummies(tmp_x_test, columns = ['age', 'marital_status', 'occupation', 'race', 'sex'])
    
    
    return tmp_x_train.values, tmp_x_valid.values, tmp_x_test.values

In [7]:
tmp_x_train

NameError: name 'tmp_x_train' is not defined

In [15]:
preprocess(x_train, x_valid, test)

(array([[0.66666667, 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        [0.53333333, 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        [0.53333333, 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        ...,
        [0.8       , 0.        , 0.        , ..., 1.        , 1.        ,
         0.        ],
        [0.8       , 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        [0.53333333, 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ]]),
 array([[0.53333333, 0.        , 0.        , ..., 1.        , 1.        ,
         0.        ],
        [0.86666667, 0.        , 0.        , ..., 0.        , 0.        ,
         1.        ],
        [0.6       , 0.        , 0.        , ..., 1.        , 0.        ,
         1.        ],
        ...,
        [0.53333333, 0.        , 0.        , ..., 1.        , 1.        ,
         0.        ],
        [0.8

In [16]:
tmp_x_train.describe()

Unnamed: 0,education_num,capital_gain,capital_loss,hours_per_week,cap_gain_high,cap_loss_high,edu_num_high,hpw_high,age_20~25,age_25~30,...,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,race_0,race_1,sex_Female,sex_Male
count,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0,...,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0,20839.0
mean,0.605579,0.733625,88.000432,0.402335,0.083113,0.034503,0.248332,0.200681,0.123518,0.129517,...,0.004943,0.126302,0.019387,0.116368,0.028264,0.049187,0.113201,0.886799,0.332598,0.667402
std,0.170939,2.454261,403.795858,0.126457,0.27606,0.182521,0.432056,0.40052,0.329039,0.335779,...,0.070132,0.332197,0.137883,0.320674,0.165731,0.216263,0.316846,0.316846,0.471155,0.471155
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.533333,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.6,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,0.733333,0.0,0.0,0.44898,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
max,1.0,11.512915,4356.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
tmp_x_test.describe().T

AttributeError: 'DataFrame' object has no attribute 't'

# 일단 k-fold 한번 돌려보고


In [8]:
from sklearn.model_selection import StratifiedKFold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)

In [9]:
val_scores = list()
oof_pred = np.zeros((test.shape[0],))

for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
    x_train, y_train = train.iloc[trn_idx, :], label[trn_idx]
    x_valid, y_valid = train.iloc[val_idx, :], label[val_idx]
    
    # 전처리
    x_train, x_valid, x_test = preprocess(x_train, x_valid, test)
    
    # 모델 정의
    clf = LGBMClassifier()
    
    # 모델 학습
    clf.fit(x_train, y_train,
            eval_set = [[x_valid, y_valid]], 
            eval_metric = 'logloss',        
            early_stopping_rounds = 100,
            verbose = 100,  )

    # 훈련, 검증 데이터 Log Loss 확인
    trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
    val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
    print('{} Fold, train f1_score : {:.4f}4, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))
    
    val_scores.append(val_f1_score)
    

# 교차 검증 F1 Score 평균 계산하기
print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.280645
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.280645
0 Fold, train f1_score : 0.87984, validation f1_score : 0.8695

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.28152
Did not meet early stopping. Best iteration is:
[99]	valid_0's binary_logloss: 0.281508
1 Fold, train f1_score : 0.88144, validation f1_score : 0.8670

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.285252
Did not meet early stopping. Best iteration is:
[95]	valid_0's binary_logloss: 0.285136
2 Fold, train f1_score : 0.88094, validation f1_score : 0.8651

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.283011
Did not meet early stopping. Best iteration is:
[98]	valid_0's binary_logloss: 0.282902
3 Fold, train f1_score : 0.88134, validation

In [10]:
a1 = clf.predict(tmp_x_valid)
print(f"LightGBM F1 Score: {f1_score(y_valid, a1, average='micro')}")

LightGBM F1 Score: 0.87022461124976


### oof

In [11]:
# logistic regression

val_scores = list()
oof_pred = np.zeros((test.shape[0], )) # 이것도 달라짐

for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
    x_train, y_train = train.iloc[trn_idx, :], label[trn_idx]
    x_valid, y_valid = train.iloc[val_idx, :], label[val_idx]
    
    # 전처리
    x_train, x_valid, x_test = preprocess(x_train, x_valid, test)
    
    # 모델 정의
    clf = LogisticRegression()
    
    # 모델 학습
    clf.fit(x_train, y_train)

    # 훈련, 검증 데이터 F1 Score 확인
    trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
    val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
    print('{} Fold, train f1_score : {:.4f}4, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))
    
    val_scores.append(val_f1_score)
    
    oof_pred += clf.predict_proba(x_test)[: , 1] / n_splits # 이게 달라진거임
    

# 교차 검증 F1 Score 평균 계산하기
print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

0 Fold, train f1_score : 0.84424, validation f1_score : 0.8468

1 Fold, train f1_score : 0.84584, validation f1_score : 0.8428

2 Fold, train f1_score : 0.84424, validation f1_score : 0.8440

3 Fold, train f1_score : 0.84574, validation f1_score : 0.8438

4 Fold, train f1_score : 0.84434, validation f1_score : 0.8472

Cross Validation Score : 0.8449


In [13]:
oof_pred

array([0.0143961 , 0.60567547, 0.00367895, ..., 0.09177444, 0.267417  ,
       0.00633541])

### stacking

In [12]:
val_scores = list()

new_x_train_list = [np.zeros((train.shape[0], 1)) for _ in range(4)]
new_x_test_list  = [np.zeros((test.shape[0], 1)) for _ in range(4)]

for i, (trn_idx, val_idx) in enumerate(skf.split(train, label)):
    print(f"Fold {i} Start")
    x_train, y_train = train.iloc[trn_idx, :], label[trn_idx]
    x_valid, y_valid = train.iloc[val_idx, :], label[val_idx]
    
    # 전처리
    x_train, x_valid, x_test = preprocess(x_train, x_valid, test)
    
    # 모델 정의
    clfs = [LGBMClassifier()] # 추가 모델 입력
    
    for model_idx, clf in enumerate(clfs):
        clf.fit(x_train, y_train)
        
        new_x_train_list[model_idx][val_idx, :] = clf.predict_proba(x_valid)[:, 1].reshape(-1, 1)
        new_x_test_list[model_idx][:] += clf.predict_proba(x_test)[:, 1].reshape(-1, 1) / n_splits

Fold 0 Start
Fold 1 Start
Fold 2 Start
Fold 3 Start
Fold 4 Start


In [15]:
new_train = pd.DataFrame(np.concatenate(new_x_train_list, axis=1), columns=None)
new_label = label
new_test = pd.DataFrame(np.concatenate(new_x_test_list, axis=1), columns=None)

new_train.shape, new_label.shape, new_test.shape

((26049, 4), (26049,), (6512, 4))

In [19]:
def xgb_f1(y, t, threshold=0.5):
    t = t.get_label()
    y_bin = (y > threshold).astype(int) 
    return 'f1', f1_score(t, y_bin, average='micro')

In [21]:
val_scores = list()
oof_pred = np.zeros((test.shape[0], ))

for i, (trn_idx, val_idx) in enumerate(skf.split(new_train, new_label)):
    x_train, y_train = new_train.iloc[trn_idx, :], new_label[trn_idx]
    x_valid, y_valid = new_train.iloc[val_idx, :], new_label[val_idx]
    
    # 전처리
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_valid = scaler.transform(x_valid)
    x_test  = scaler.transform(new_test)
    
    # 모델 정의
    clf = LGBMClassifier()
    #clf = XGBClassifier(tree_method='gpu_hist')
    
    # 모델 학습
    clf.fit(x_train, y_train,
            eval_set = [[x_valid, y_valid]], 
            eval_metric = 'logloss',        # 모델에 따라 판단 (xgbosst : xgb_f1)
            early_stopping_rounds = 100,
            verbose = 100,  )

    # 훈련, 검증 데이터 F1 Score 확인
    trn_f1_score = f1_score(y_train, clf.predict(x_train), average='micro')
    val_f1_score = f1_score(y_valid, clf.predict(x_valid), average='micro')
    print('{} Fold, train f1_score : {:.4f}4, validation f1_score : {:.4f}\n'.format(i, trn_f1_score, val_f1_score))
    
    val_scores.append(val_f1_score)
    
    oof_pred += clf.predict_proba(x_test)[:, 1] / n_splits
    

# 교차 검증 F1 Score 평균 계산하기
print('Cross Validation Score : {:.4f}'.format(np.mean(val_scores)))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.284819
Did not meet early stopping. Best iteration is:
[50]	valid_0's binary_logloss: 0.282398
0 Fold, train f1_score : 0.86854, validation f1_score : 0.8701

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.283984
Did not meet early stopping. Best iteration is:
[55]	valid_0's binary_logloss: 0.282714
1 Fold, train f1_score : 0.86854, validation f1_score : 0.8660

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.287994
Did not meet early stopping. Best iteration is:
[52]	valid_0's binary_logloss: 0.286446
2 Fold, train f1_score : 0.86914, validation f1_score : 0.8641

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.28799
Did not meet early stopping. Best iteration is:
[49]	valid_0's binary_logloss: 0.285979
3 Fold, train f1_score : 0.86924, validation 

In [None]:
# 이거 되는지 확인

In [22]:
#building models
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import time
import sys

#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 

#metrics 
from sklearn.metrics import roc_auc_score, roc_curve
import shap

In [23]:
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

In [24]:
def bayes_parameter_opt_lgb(X, y, init_round=20, opt_round=30, n_folds=5, random_seed=6, n_estimators=10000,
                            learning_rate=0.05, output_process=False):
    # prepare data

    train_data = lgb.Dataset(data=X, label=y)
    # parameters

    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, 
                 lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        
        global cv_result

        params = {'objective':'binary','num_iterations':1000, 'learning_rate':0.05,
                  'early_stopping_round':100, 'metric':'binary_logloss'} #rmse
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        
        cv_result = lgb.cv(params, train_data, nfold=3, seed=random_seed,
                           stratified=False, verbose_eval =200, metrics=['binary_logloss']) #rmse

        return min(cv_result['binary_logloss-mean']) #  여기서 에러...

    # setting range of the parameters
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.5, 1),
                                            'max_depth': (5, 8.99),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (5, 60)}, random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    # output optimization process
    if output_process==True: lgbBO.points_to_csv("bayes_opt_result.csv")
    
    # return
    return lgbBO

opt_params = bayes_parameter_opt_lgb(tmp_x_train, y_train, init_round=5, opt_round=10, n_folds=3,
                                     random_seed=6, n_estimators=10000, learning_rate=0.05)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
[200]	cv_agg's binary_logloss: 0.305371 + 0.00716862
[400]	cv_agg's binary_logloss: 0.303903 + 0.00741055
[600]	cv_agg's binary_logloss: 0.303098 + 0.00741458
[800]	cv_agg's binary_logloss: 0.302444 + 0.00740958
[1000]	cv_agg's binary_logloss: 0.301924 + 0.00739532
| [0m 1       [0m | [0m 0.3019  [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 3.014   [0m | [0m 1.635   [0m | [0m 6.69    [0m | [0m 40.52   [0m | [0m 0.04432 [0m | [0m 42.73   [0m |
[200]	cv_agg's binary_logloss: 0.310054 + 0.00746455
[400]	cv_agg's binary_logloss: 0.308554 + 0.00785947
[600]	cv_agg's binary_logloss: 0.30809 + 0.00787684
[800]	cv_agg's binary_logloss: 0.307634 + 0.00787019
[1000]	cv_agg's binary_logloss: 0.30726 + 0.00786974
| [95m 2       [0m | 

In [25]:
params = opt_params.max['params']

In [26]:
params

{'bagging_fraction': 0.5,
 'feature_fraction': 0.1,
 'lambda_l1': 5.0,
 'lambda_l2': 3.0,
 'max_depth': 8.99,
 'min_child_weight': 60.0,
 'min_split_gain': 0.1,
 'num_leaves': 24.0}

In [30]:
params = {
    "objective" : "binary",
    "metric" : "binary_logloss", 
    "bagging_frequency" : 5,
    "bagging_seed" : 2018,
    "verbosity" : -1,

    # Selected rounded-off params
    'bagging_fraction': 0.5,
    'feature_fraction': 0.1,
    'lambda_l1': 5.0,
    'lambda_l2': 3.0,
    'max_depth': 9,
    'min_child_weight': 60.0,
    'min_split_gain': 0.1,
    'num_leaves': 24
}

In [31]:
import lightgbm as lgb
d_train = lgb.Dataset(tmp_x_train, label=y_train)
d_test = lgb.Dataset(tmp_x_valid, label=y_valid)
clf = lgb.train(params, d_train, 1000, d_test, verbose_eval=100, early_stopping_rounds=100)

y_pred = clf.predict(x_valid)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.322878
[200]	valid_0's binary_logloss: 0.305433
[300]	valid_0's binary_logloss: 0.301195
[400]	valid_0's binary_logloss: 0.300539
[500]	valid_0's binary_logloss: 0.300222
[600]	valid_0's binary_logloss: 0.300006
[700]	valid_0's binary_logloss: 0.299766
[800]	valid_0's binary_logloss: 0.299593
[900]	valid_0's binary_logloss: 0.299381
[1000]	valid_0's binary_logloss: 0.299214
Did not meet early stopping. Best iteration is:
[997]	valid_0's binary_logloss: 0.299214


LightGBMError: The number of features in data (4) is not the same as it was in training data (40).

In [33]:
import sklearn.model_selection as model_selection

In [36]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgval], early_stopping_rounds=100, 
                      verbose_eval=100, evals_result=evals_result)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

pred_test = 0
kf = model_selection.KFold(n_splits=5, random_state=2018, shuffle=True)
for dev_index, val_index in kf.split(train):
    
        # 전처리
    x_train, x_valid, x_test = preprocess(x_train, x_valid, test)

    dev_X, val_X = x_train.loc[dev_index,:], x_train.loc[val_index,:]
    dev_y, val_y = x_valid[dev_index], x_valid[val_index]
    
    pred_test_tmp, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)
    pred_test += pred_test_tmp
pred_test /= 5.

AttributeError: 'numpy.ndarray' object has no attribute 'reset_index'