In [19]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 
import lightgbm as lgb

from sklearn.metrics import roc_auc_score

In [23]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [24]:
target = train['target']
del train['target']

test_id = test['ID_code']

train.drop('ID_code',axis=1 , inplace=True)
test.drop('ID_code',axis=1 , inplace=True)

In [12]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
}

In [21]:
# frolds 지정
num_folds = 5
#.columns를 통해 변수 지정
features = [c for c in train.columns ]

#계층별 kfold
folds = StratifiedKFold(n_splits=num_folds, shuffle=False, random_state=2319)

oof = np.zeros(len(train))
getVal = np.zeros(len(train))
predictions = np.zeros(len(target))
feature_importance_df = pd.DataFrame()

print('Light GBM Model')
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    
    X_train, y_train = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx][features], target.iloc[val_idx]
    
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    
    clf = lgb.train(param, trn_data, 10000 ,valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 4000)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    getVal[val_idx]+= clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Light GBM Model
Fold idx:1
Training until validation scores don't improve for 4000 rounds.
[1000]	training's auc: 0.902435	valid_1's auc: 0.883856
[2000]	training's auc: 0.912589	valid_1's auc: 0.89053
[3000]	training's auc: 0.919077	valid_1's auc: 0.893948
[4000]	training's auc: 0.924001	valid_1's auc: 0.89606
[5000]	training's auc: 0.928194	valid_1's auc: 0.89724
[6000]	training's auc: 0.932003	valid_1's auc: 0.89817
[7000]	training's auc: 0.935468	valid_1's auc: 0.89872
[8000]	training's auc: 0.938761	valid_1's auc: 0.89904
[9000]	training's auc: 0.941942	valid_1's auc: 0.899227
[10000]	training's auc: 0.944957	valid_1's auc: 0.899369
Did not meet early stopping. Best iteration is:
[10000]	training's auc: 0.944957	valid_1's auc: 0.899369
Fold idx:2
Training until validation scores don't improve for 4000 rounds.
[1000]	training's auc: 0.903164	valid_1's auc: 0.879145
[2000]	training's auc: 0.913339	valid_1's auc: 0.886907
[3000]	training's auc: 0.919664	valid_1's auc: 0.891203
[4000]

In [25]:
##submission
sub_df1 = pd.DataFrame({"ID_code":test_id})
sub_df1["target"] = predictions
sub_df1.to_csv("lgboost_oof.csv", index=False)