In [None]:
# LightGBM stacking ensemble: three LightGBM base models blended by a logistic-regression stacker

In [1]:
import gc
import time
import warnings
from contextlib import contextmanager

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Load the training table; the TARGET label column is read from it below.
train_df = pd.read_csv('train_df_select_feature.csv')

In [3]:
# Load the test table; its SK_ID_CURR column is written to the submission file.
test_df = pd.read_csv('test_df_select_feature.csv')

In [4]:
# Model features: every training column except the label, the SK_ID_*
# identifier columns and 'index'.
feats = [
    col
    for col in train_df.columns
    if col not in ('TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index')
]

In [5]:
# Model inputs: the same feature columns taken from both tables.
train = train_df[feats]
test = test_df[feats]

# Labels for the training rows.
train_target = train_df['TARGET']

In [6]:
# Hyper-parameters for the first LightGBM base model.
lgb_params = {
    'learning_rate': 0.02,
    'n_estimators': 10000,
    'nthread': 4,
    'num_leaves': 32,
    'colsample_bytree': 0.9497036,
    'subsample': 0.8715623,
    'max_depth': 8,
    'reg_alpha': 0.04,
    'reg_lambda': 0.073,
    'min_split_gain': 0.0222415,
    'min_child_weight': 40,
    'silent': -1,
    'verbose': -1,
}
lgb_model = LGBMClassifier(**lgb_params)

In [7]:
# Hyper-parameters for the second LightGBM base model.  They are stored in
# their own dict (lgb_params2): writing them into `lgb_params` instead would
# leave lgb_params2 empty and build this classifier with library defaults.
lgb_params2 = {
    'learning_rate': 0.02,
    'n_estimators': 10000,
    'nthread': 4,
    'num_leaves': 44,
    'colsample_bytree': 0.1735,
    'subsample': 0.9619,
    'max_depth': 9,
    'reg_alpha': 0.8082,
    'reg_lambda': 0.7325,
    'min_split_gain': 0.0403,
    'min_child_weight': 57.4218,
    'scale_pos_weight': 1.3281,
    'silent': -1,
    'verbose': -1,
}
lgb_model2 = LGBMClassifier(**lgb_params2)

In [8]:
# Hyper-parameters for the third LightGBM base model.  They are stored in
# their own dict (lgb_params3): writing them into `lgb_params` instead would
# leave lgb_params3 empty and build this classifier with library defaults.
lgb_params3 = {
    'learning_rate': 0.02,
    'n_estimators': 10000,
    'nthread': 4,
    'num_leaves': 45,
    'colsample_bytree': 0.1518,
    'subsample': 0.9051,
    'max_depth': 8,
    'reg_alpha': 0.5678,
    'reg_lambda': 0.9661,
    'min_split_gain': 0.0928,
    'min_child_weight': 68.5673,
    'scale_pos_weight': 1.2034,
    'silent': -1,
    'verbose': -1,
}
lgb_model3 = LGBMClassifier(**lgb_params3)

In [9]:
class Ensemble(object):
    """Two-level stacking ensemble.

    Each base model is fit on the training part of every stratified fold.
    Its predictions on the held-out part fill one column of the stacker's
    training matrix, and its predictions on the test set, averaged over the
    folds, fill the matching column of the stacker's test matrix.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds used to build the out-of-fold predictions.
    stacker : estimator
        Second-level model; must provide ``fit`` and ``predict_proba``.
    base_models : sequence of estimators
        First-level models; each must provide ``fit`` and ``predict_proba``.
    """

    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        """Fit the base models and the stacker, then predict for ``T``.

        Parameters
        ----------
        X : array-like
            Training features, one row per sample.
        y : array-like
            Training labels, aligned with the rows of ``X``.
        T : array-like
            Test features with the same columns as ``X``.

        Returns
        -------
        numpy.ndarray
            Column 1 of the stacker's ``predict_proba`` output for each row
            of ``T``.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        # Materialise the folds once so that every base model is trained and
        # scored on identical splits; the fixed seed keeps them reproducible.
        folds = list(
            StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                            random_state=2016).split(X, y)
        )

        # One column per base model: out-of-fold predictions for the training
        # rows, fold-averaged predictions for the test rows.
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))

        for i, clf in enumerate(self.base_models):
            # Test-set predictions of this model, one column per fold.
            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                print("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X[train_idx], y[train_idx])

                S_train[test_idx, i] = clf.predict_proba(X[test_idx])[:, 1]
                S_test_i[:, j] = clf.predict_proba(T)[:, 1]

            S_test[:, i] = S_test_i.mean(axis=1)

        # cross_val_score has no early-stopping option; passing
        # `early_stopping_rounds` to it raises TypeError, so only the
        # arguments it accepts are supplied.
        results = cross_val_score(self.stacker, S_train, y, cv=3, scoring='roc_auc')
        print("Stacker score: %.5f" % (results.mean()))

        # Final stacker is trained on all out-of-fold predictions.
        self.stacker.fit(S_train, y)
        return self.stacker.predict_proba(S_test)[:, 1]

In [10]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Second-level learner, fit on the base models' predictions.
log_model = LogisticRegression()

# Stack the three LightGBM models using 3 folds.
stack = Ensemble(
    n_splits=3,
    stacker=log_model,
    base_models=(lgb_model, lgb_model2, lgb_model3),
)

In [None]:
from sklearn.model_selection import cross_val_score

# Fit the stack on the training data and predict for the test rows.
y_pred = stack.fit_predict(train, train_target, test)

# Attach the predictions to the test table and write SK_ID_CURR together
# with the predicted TARGET value to the submission file.
test_df['TARGET'] = y_pred
test_df[['SK_ID_CURR', 'TARGET']].to_csv(path_or_buf='ensemble_submission.csv', index=False)



Fit LGBMClassifier fold 1
