In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier 
import lightgbm as lgb

from sklearn.metrics import roc_auc_score

In [2]:
# Read the competition's training and test tables from the working directory
# (train.csv first, then test.csv).
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Detach the label column from the training frame; `train` loses the column in place.
target = train.pop('target')

# Remember the test row identifiers before the column is dropped below.
test_id = test['ID_code']

# ID_code is removed from both frames in place so that only the remaining
# columns are left in `train` and `test`.
train.drop(columns=['ID_code'], inplace=True)
test.drop(columns=['ID_code'], inplace=True)

In [8]:
def _shuffled_copies(rows, repeats, shuffler):
    """Return `repeats` copies of `rows`, each with every column independently permuted."""
    copies = []
    for _ in range(repeats):
        shuffled = rows.copy()
        # A fresh identity index per copy keeps the sequence of random draws
        # identical to shuffling a newly created arange each time.
        ids = np.arange(rows.shape[0])
        for c in range(rows.shape[1]):
            shuffler.shuffle(ids)
            # Gather only column c in permuted order; `rows` itself is never
            # modified, so every column is drawn from the unshuffled values.
            shuffled[:, c] = rows[ids, c]
        copies.append(shuffled)
    return copies


def augment(x, y, t=2, rng=None):
    """Augment a binary-labelled feature matrix by shuffling columns within each class.

    For each class, synthetic rows are created by permuting every feature column
    independently among the rows of that class, so each synthetic row mixes
    feature values taken from different real rows of the same class.

    Parameters
    ----------
    x : array-like, 2-D (rows x features)
        Feature matrix.
    y : array-like, 1-D
        Labels aligned with the rows of `x`. Rows with ``y > 0`` are treated as
        positives and rows with ``y == 0`` as negatives.
    t : int, default 2
        Number of shuffled copies of the positive rows to append. The negative
        rows get ``t // 2`` shuffled copies.
    rng : optional
        Object exposing ``shuffle(array)`` (e.g. ``np.random.RandomState`` or
        ``np.random.Generator``). When ``None`` the global ``np.random`` state
        is used, which is the historical behaviour.

    Returns
    -------
    (x_aug, y_aug) : tuple of np.ndarray
        The original rows, followed by the positive copies, followed by the
        negative copies; `y_aug` holds 1.0 for appended positives and 0.0 for
        appended negatives.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    shuffler = np.random if rng is None else rng

    xs = _shuffled_copies(x[y > 0], t, shuffler)
    xn = _shuffled_copies(x[y == 0], t // 2, shuffler)

    ys = np.ones(sum(block.shape[0] for block in xs))
    yn = np.zeros(sum(block.shape[0] for block in xn))

    # Stacking the lists together with `x` (instead of stacking each list on its
    # own first) stays valid when a list is empty, e.g. t=1 gives no negative copies.
    x = np.vstack([x] + xs + xn)
    y = np.concatenate([y, ys, yn])
    return x, y

In [4]:
# LightGBM training parameters handed to lgb.train().
param = dict(
    bagging_freq=5,                # re-draw the row subsample every 5 iterations
    bagging_fraction=0.335,        # fraction of rows used in each bagging round
    boost_from_average='false',    # passed as the string 'false', not a Python bool
    boost='gbdt',                  # gradient-boosted decision trees
    feature_fraction=0.041,        # fraction of features sampled per tree
    learning_rate=0.0083,          # shrinkage applied to each tree
    max_depth=-1,                  # -1: tree depth is not limited
    metric='auc',                  # evaluation metric reported on the validation sets
    min_data_in_leaf=80,           # minimum number of rows in a leaf
    min_sum_hessian_in_leaf=10.0,  # minimum sum of hessians in a leaf
    num_leaves=13,                 # maximum number of leaves per tree
    num_threads=8,                 # number of threads used by LightGBM
    tree_learner='serial',         # single-machine tree learner
    objective='binary',            # binary classification objective
    verbosity=-1,                  # negative value: only fatal messages are logged
)

In [9]:
# Number of cross-validation folds.
num_folds = 5
# Number of models trained per fold, each on a freshly augmented copy of the
# fold's training rows; their predictions are averaged.
N = 3
# Every column left in `train` is used as a model feature.
features = list(train.columns)

# Seed NumPy's global generator so the column shuffles done by augment() are
# repeatable when the notebook is re-run from a fresh kernel.
np.random.seed(2319)

# Stratified K-fold keeps the class ratio of `target` in every fold.
# random_state is not passed: it has no effect when shuffle=False, and recent
# scikit-learn releases raise a ValueError for that combination.
folds = StratifiedKFold(n_splits=num_folds, shuffle=False)

# Out-of-fold predictions for the training rows (mean over the N repeats).
oof = np.zeros(len(train))
# NOTE(review): getVal keeps its historical definition (sum over the N repeats
# of prediction / n_splits) in case later cells read it; it is a rescaled
# variant of `oof` -- confirm whether it is still needed.
getVal = np.zeros(len(train))
# Test-set predictions, averaged over folds and repeats. Sized from `test`,
# not from the training labels.
predictions = np.zeros(len(test))
fold_importances = []

print('Light GBM Model')
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("Fold idx:{}".format(fold_ + 1))

    X_train, y_train = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_valid, y_valid = train.iloc[val_idx][features], target.iloc[val_idx]

    # Running sums over the N repeats: validation predictions, test predictions
    # and feature importances.
    p_valid, yp = 0, 0
    importance_sum = np.zeros(len(features))
    for i in range(N):
        # Only the training part of the fold is augmented; validation rows stay untouched.
        X_tr, y_tr = augment(X_train.values, y_train.values)
        # Keep the real feature names so training, validation and test frames agree.
        X_tr = pd.DataFrame(X_tr, columns=features)

        trn_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_valid, label=y_valid)

        # NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments
        # were removed from lgb.train in LightGBM 4.0; switch to the
        # lgb.log_evaluation / lgb.early_stopping callbacks when upgrading.
        clf = lgb.train(param, trn_data, 10000, valid_sets=[trn_data, val_data], verbose_eval=1000, early_stopping_rounds=4000)

        valid_pred = clf.predict(X_valid, num_iteration=clf.best_iteration)
        p_valid += valid_pred
        yp += clf.predict(test[features], num_iteration=clf.best_iteration)
        getVal[val_idx] += valid_pred / folds.n_splits
        importance_sum += clf.feature_importance()

    # Each validation row belongs to exactly one fold, so it is written once.
    oof[val_idx] = p_valid / N
    predictions += yp / N / folds.n_splits

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = importance_sum / N
    fold_importance_df["fold"] = fold_ + 1
    fold_importances.append(fold_importance_df)

# Concatenate once instead of growing the frame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Light GBM Model
Fold idx:1
Training until validation scores don't improve for 4000 rounds.
[1000]	training's auc: 0.893906	valid_1's auc: 0.882102
[2000]	training's auc: 0.901813	valid_1's auc: 0.888329
[3000]	training's auc: 0.906758	valid_1's auc: 0.89189
[4000]	training's auc: 0.910549	valid_1's auc: 0.8946
[5000]	training's auc: 0.91355	valid_1's auc: 0.896134
[6000]	training's auc: 0.916105	valid_1's auc: 0.89745
[7000]	training's auc: 0.918359	valid_1's auc: 0.898284
[8000]	training's auc: 0.920438	valid_1's auc: 0.898861
[9000]	training's auc: 0.922409	valid_1's auc: 0.899236
[10000]	training's auc: 0.924238	valid_1's auc: 0.899465
Did not meet early stopping. Best iteration is:
[10000]	training's auc: 0.924238	valid_1's auc: 0.899465
Fold idx:2
Training until validation scores don't improve for 4000 rounds.
[1000]	training's auc: 0.894562	valid_1's auc: 0.878551
[2000]	training's auc: 0.902585	valid_1's auc: 0.885767
[3000]	training's auc: 0.90775	valid_1's auc: 0.889882
[4000]

In [10]:
## Submission
# Pair every test identifier with its averaged prediction, then write the
# result to disk without the DataFrame index.
sub_df1 = pd.DataFrame({"ID_code": test_id}).assign(target=predictions)
sub_df1.to_csv("lgboost_oof_augment.csv", index=False)