In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold


# Read the raw competition tables from the working directory.
train_rd, test_rd = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Split the training rows by label value: target == 0 and target == 1.
group_0, group_1 = (
    train_rd.loc[train_rd['target'] == label] for label in (0, 1)
)

In [1]:
# Model inputs: every training column except the row identifier and the label.
features = [col for col in train_rd.columns if col not in ('ID_code', 'target')]
target = train_rd['target']

# LightGBM parameter dictionary (same keys, values and order as before,
# spelled with keyword syntax).
param = dict(
    bagging_freq=5,
    bagging_fraction=0.333,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.05,
    learning_rate=0.008,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

NameError: name 'train_rd' is not defined

In [3]:
def augment(x,y,t=2,rng=None):
    """Oversample a labelled feature matrix by shuffling columns within each class.

    For every round, the rows of one class are copied and each column of the
    copy is permuted independently, so a synthetic row mixes feature values
    taken from different real rows of the same class.
    NOTE(review): this only preserves the class signal if features are close
    to independent given the label -- confirm for the data at hand.

    Parameters
    ----------
    x : 2-D numpy array, one row per sample and one column per feature.
    y : 1-D numpy array of labels aligned with the rows of ``x``.
        Rows with ``y > 0`` are treated as positives, rows with ``y == 0``
        as negatives.
    t : int, default 2
        Number of shuffled copies of the positive rows to add. Negative rows
        get ``t // 2`` shuffled copies. ``t`` of 0 or 1 is allowed and simply
        adds fewer (or no) copies.
    rng : object with a ``shuffle`` method, optional
        For example ``np.random.RandomState(seed)`` or
        ``np.random.default_rng(seed)``. When omitted, NumPy's global random
        state is used, exactly as before.

    Returns
    -------
    (x_aug, y_aug) : the original rows, followed by the shuffled positive
        copies (label 1.0), followed by the shuffled negative copies
        (label 0.0).
    """
    shuffler = np.random if rng is None else rng

    def _shuffled_copy(rows):
        # Permute every column independently; `ids` is reshuffled in place
        # before each column so no two columns share a permutation.
        out = rows.copy()
        ids = np.arange(out.shape[0])
        for c in range(out.shape[1]):
            shuffler.shuffle(ids)
            # Index the single column directly instead of materialising a
            # full row-permuted copy of the matrix for every column.
            out[:,c] = out[ids,c]
        return out

    # The class subsets do not change between rounds, so select them once.
    positives = x[y>0]
    negatives = x[y==0]

    # Positive rounds run before negative rounds so random draws are consumed
    # in the same order as the original implementation.
    xs = [_shuffled_copy(positives) for _ in range(t)]
    xn = [_shuffled_copy(negatives) for _ in range(t//2)]

    # np.vstack([]) raises, so fall back to a zero-row slice when a class
    # received no rounds (t < 1 for positives, t < 2 for negatives).
    xs = np.vstack(xs) if xs else x[:0]
    xn = np.vstack(xn) if xn else x[:0]

    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x = np.vstack([x,xs,xn])
    y = np.concatenate([y,ys,yn])
    return x,y



In [6]:
# Cross-validated training: for each of the 15 stratified folds, N models are
# trained on independently augmented copies of the fold's training rows, and
# the test predictions of all folds * N models are averaged into `predictions`.

N = 5  # augmentation rounds (and therefore models) per fold

# augment() shuffles with NumPy's global random state; seed it so the
# augmented training sets are the same on every Restart & Run All.
np.random.seed(42)

folds = StratifiedKFold(n_splits=15, shuffle=True,random_state=42)
predictions = np.zeros(len(test_rd))

# The test feature matrix does not change between models; slice it once.
test_X = test_rd[features]

# The splitter only needs the number of samples and the labels, so pass a
# placeholder for X instead of converting the whole frame (ID strings
# included) to an object array.
for fold_count, (train_idx, valid_idx) in enumerate(folds.split(np.zeros(len(target)), target.values)):

    train_X,train_y = train_rd.iloc[train_idx][features] , target.iloc[train_idx]
    valid_X,valid_Y = train_rd.iloc[valid_idx][features] , target.iloc[valid_idx]

    for i in range(N):

        # Fresh augmentation per model: original rows plus column-shuffled copies.
        double_X,double_Y = augment(train_X.values,train_y.values)

        train_data = lgb.Dataset(double_X,label=double_Y)
        valid_data = lgb.Dataset(valid_X,label=valid_Y)

        # NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed
        # from lgb.train in LightGBM 4.0; when upgrading, pass
        # callbacks=[lgb.early_stopping(2500), lgb.log_evaluation(1000)] instead.
        clf = lgb.train(param, train_data, 50000, valid_sets = valid_data, verbose_eval=1000, early_stopping_rounds = 2500)

        # Each model contributes an equal share of the final average.
        predictions += clf.predict(test_X, num_iteration=clf.best_iteration)/(folds.n_splits*N)

    print('\nfold %d completed\n' % fold_count)

Training until validation scores don't improve for 2500 rounds.
[1000]	valid_0's auc: 0.880148
[2000]	valid_0's auc: 0.887472
[3000]	valid_0's auc: 0.891998
[4000]	valid_0's auc: 0.894494
[5000]	valid_0's auc: 0.89648
[6000]	valid_0's auc: 0.897876
[7000]	valid_0's auc: 0.898763
[8000]	valid_0's auc: 0.899367
[9000]	valid_0's auc: 0.899661
[10000]	valid_0's auc: 0.899949
[11000]	valid_0's auc: 0.899957
[12000]	valid_0's auc: 0.900053
[13000]	valid_0's auc: 0.900094
[14000]	valid_0's auc: 0.90002
[15000]	valid_0's auc: 0.899937
Early stopping, best iteration is:
[12517]	valid_0's auc: 0.900114
Training until validation scores don't improve for 2500 rounds.
[1000]	valid_0's auc: 0.880029
[2000]	valid_0's auc: 0.88769
[3000]	valid_0's auc: 0.891973
[4000]	valid_0's auc: 0.894625
[5000]	valid_0's auc: 0.89682
[6000]	valid_0's auc: 0.898158
[7000]	valid_0's auc: 0.899027
[8000]	valid_0's auc: 0.899567
[9000]	valid_0's auc: 0.899942
[10000]	valid_0's auc: 0.900371
[11000]	valid_0's auc: 0.90

Training until validation scores don't improve for 2500 rounds.
[1000]	valid_0's auc: 0.884278
[2000]	valid_0's auc: 0.891896
[3000]	valid_0's auc: 0.896883
[4000]	valid_0's auc: 0.900342
[5000]	valid_0's auc: 0.902764
[6000]	valid_0's auc: 0.904144
[7000]	valid_0's auc: 0.905179
[8000]	valid_0's auc: 0.905815
[9000]	valid_0's auc: 0.906227
[10000]	valid_0's auc: 0.906483
[11000]	valid_0's auc: 0.906628
[12000]	valid_0's auc: 0.906683
[13000]	valid_0's auc: 0.906693
[14000]	valid_0's auc: 0.906756
[15000]	valid_0's auc: 0.906746
[16000]	valid_0's auc: 0.906735
Early stopping, best iteration is:
[13830]	valid_0's auc: 0.906761

The 3 fold complete

Training until validation scores don't improve for 2500 rounds.
[1000]	valid_0's auc: 0.877057
[2000]	valid_0's auc: 0.885357
[3000]	valid_0's auc: 0.890126
[4000]	valid_0's auc: 0.893264
[5000]	valid_0's auc: 0.895333
[6000]	valid_0's auc: 0.897008
[7000]	valid_0's auc: 0.898015
[8000]	valid_0's auc: 0.898809
[9000]	valid_0's auc: 0.899288
[

In [14]:
# Assemble the submission table: test-row identifiers first, then the
# `predictions` array as the `target` column.
result = pd.DataFrame({"ID_code": test_rd["ID_code"].values}).assign(target=predictions)

# Write the CSV without the DataFrame index column.
result.to_csv("submission.csv", index=False)