In [3]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
train_df  = pd.read_csv('train.csv/train.csv')

In [15]:
X = train_df.drop(['ID_code','target'], axis=1)

In [16]:
Y = train_df['target']

In [18]:
features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
target = train_df['target']

In [19]:
test_df = pd.read_csv('test.csv/test.csv')

In [20]:
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.33,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.04,
    'learning_rate': 0.008,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1
}

In [21]:
num_round = 1000000
# check random state 44000
folds = StratifiedKFold(n_splits=12, shuffle=False, random_state=12345)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))

In [22]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3500)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Fold 0
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.900376	valid_1's auc: 0.883057
[2000]	training's auc: 0.909862	valid_1's auc: 0.889854
[3000]	training's auc: 0.915873	valid_1's auc: 0.893823
[4000]	training's auc: 0.920528	valid_1's auc: 0.896393
[5000]	training's auc: 0.924468	valid_1's auc: 0.898023
[6000]	training's auc: 0.927883	valid_1's auc: 0.899168
[7000]	training's auc: 0.931079	valid_1's auc: 0.900003
[8000]	training's auc: 0.934047	valid_1's auc: 0.900537
[9000]	training's auc: 0.936904	valid_1's auc: 0.90102
[10000]	training's auc: 0.939671	valid_1's auc: 0.901136
[11000]	training's auc: 0.942323	valid_1's auc: 0.901213
[12000]	training's auc: 0.944887	valid_1's auc: 0.901298
[13000]	training's auc: 0.9473	valid_1's auc: 0.901264
[14000]	training's auc: 0.949708	valid_1's auc: 0.901194
[15000]	training's auc: 0.952079	valid_1's auc: 0.901126
[16000]	training's auc: 0.954351	valid_1's auc: 0.901091
Early stopping, best iteratio

[8000]	training's auc: 0.934162	valid_1's auc: 0.898395
[9000]	training's auc: 0.936993	valid_1's auc: 0.898748
[10000]	training's auc: 0.939755	valid_1's auc: 0.89883
[11000]	training's auc: 0.942385	valid_1's auc: 0.898848
[12000]	training's auc: 0.944925	valid_1's auc: 0.899008
[13000]	training's auc: 0.947384	valid_1's auc: 0.898988
[14000]	training's auc: 0.949754	valid_1's auc: 0.89908
[15000]	training's auc: 0.952099	valid_1's auc: 0.89904
[16000]	training's auc: 0.954334	valid_1's auc: 0.899064
[17000]	training's auc: 0.956471	valid_1's auc: 0.899008
Early stopping, best iteration is:
[14425]	training's auc: 0.950776	valid_1's auc: 0.899195
Fold 9
Training until validation scores don't improve for 3500 rounds.
[1000]	training's auc: 0.899461	valid_1's auc: 0.887219
[2000]	training's auc: 0.908959	valid_1's auc: 0.894578
[3000]	training's auc: 0.915134	valid_1's auc: 0.898157
[4000]	training's auc: 0.920081	valid_1's auc: 0.900738
[5000]	training's auc: 0.924012	valid_1's auc: 0

In [23]:
submission = pd.DataFrame({"ID_code": test_df.ID_code.values})
submission["target"] = predictions
submission.to_csv("submission_final.csv", index=False)