In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
%run "Data Prep.ipynb"

In [None]:
#Tuned hyper-parameters passed to xgb.train for the level-1 XGBoost model
params = {
    'eta': 0.02,                      # small step size; paired with many boosting rounds
    'max_depth': 4,
    'subsample': 0.9,                 # row sampling ratio per tree
    'colsample_bytree': 0.9,          # column sampling ratio per tree
    'objective': 'binary:logistic',   # binary target, predictions are probabilities
    'eval_metric': 'auc',
    'silent': True,
    'seed': 42,
}

In [None]:
#Holders for the first-level model predictions; these become the features
#the stacking ensemble is trained on (stack_train) and predicts from (stack_test)
stack_train = np.zeros((len(df_train), 2))   # column 0: xgboost, column 1: lightgbm
stack_test = np.zeros((len(df_test), 2))     # same column layout, for the test rows
stack_testj = np.zeros((len(df_test), 5))    # scratch buffer: one column per CV fold

#Test set wrapped once in xgboost's DMatrix format so every fold can reuse it
d_test = xgb.DMatrix(df_test)

In [None]:
#Level-1 model 1 (XGBoost): stratified K-fold training.
#For every fold the model is fit on the training part and then used to
#  * fill column 0 of stack_train with out-of-fold predictions, and
#  * fill one column of stack_testj with predictions for the test set.
#The per-fold test predictions are averaged into column 0 of stack_test.
kfold = 5
skf = StratifiedKFold(n_splits=kfold, shuffle = True, random_state=42)


for i, (train_index, test_index) in enumerate(skf.split(df_train, dfy_train)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = df_train.iloc[train_index], df_train.iloc[test_index]
    Y_train, Y_valid = dfy_train.iloc[train_index], dfy_train.iloc[test_index]
    
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, Y_train)
    d_valid = xgb.DMatrix(X_valid, Y_valid)
    
    # 'valid' is listed last so it is the set early stopping watches
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    
    #Train xgboost model (up to 5000 rounds, stop after 200 rounds without improvement)
    mdl = xgb.train(params, d_train, 5000, watchlist, early_stopping_rounds=200,
                    feval=gini_xgb, maximize=True, verbose_eval=100)
    
    # xgb.train returns the booster from the LAST round, not the best one.
    # Restrict prediction to the best round found by early stopping so the
    # rounds trained after the optimum do not leak into the stacked features
    # (this mirrors num_iteration=best_iteration used for the LightGBM model).
    # NOTE: on XGBoost >= 2.0 the equivalent is
    #   iteration_range=(0, mdl.best_iteration + 1)
    best_ntree = mdl.best_ntree_limit
    
    #Save predictions for training the stacking ensemble
    # (d_valid is reused; labels stored in a DMatrix are ignored by predict)
    p_valid = mdl.predict(d_valid, ntree_limit=best_ntree)
    stack_train[test_index, 0] = p_valid
    
    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    
    # Predict on our test data
    stack_testj[:, i] = mdl.predict(d_test, ntree_limit=best_ntree)
    
#Get avg prediction of the folds
stack_test[:, 0] = stack_testj.mean(axis = 1)

In [None]:
#Hyper-parameters passed to lgb.train for the level-1 LightGBM model
params2 = {
    'learning_rate': 0.02,
    'max_depth': 4,
    'boosting': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'is_training_metric': False,
    'seed': 42,
}

In [None]:
#Level-1 model 2 (LightGBM): stratified K-fold training.
#For every fold the model is fit on the training part and then used to
#  * fill column 1 of stack_train with out-of-fold predictions, and
#  * fill one column of stack_testj with predictions for the test set.
#The per-fold test predictions are averaged into column 1 of stack_test.
kfolds = 5
# Use the fold count defined in THIS cell everywhere below, rather than a
# differently named variable left over from an earlier cell.
skf = StratifiedKFold(n_splits=kfolds, shuffle = True, random_state=42)

# Fresh per-fold buffer sized to this cell's fold count, so no column left
# over from a previous model can end up in the average below.
stack_testj = np.zeros((df_test.shape[0], kfolds))

for i, (train_index, test_index) in enumerate(skf.split(df_train, dfy_train)):
    print('[Fold %d/%d]' % (i + 1, kfolds))
    X_train, X_valid = df_train.iloc[train_index], df_train.iloc[test_index]
    Y_train, Y_valid = dfy_train.iloc[train_index], dfy_train.iloc[test_index]
    
    #Convert to lgb format
    l_train = lgb.Dataset(X_train, label=Y_train)
    l_valid = lgb.Dataset(X_valid, label=Y_valid)
    
    #Train model (up to 2000 rounds, stop after 200 rounds without improvement on l_valid)
    mdl2 = lgb.train(params2, l_train, 2000, l_valid, verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=200)
    
    #Save predictions for training the stacking ensemble
    # (predict at the best round found by early stopping)
    stack_train[test_index, 1] = mdl2.predict(X_valid, num_iteration=mdl2.best_iteration)
    
    #Predict on test set
    stack_testj[:, i] = mdl2.predict(df_test, num_iteration=mdl2.best_iteration)

#Get avg predictions
stack_test[:, 1] = stack_testj.mean(axis = 1)

In [None]:
#Stacking Model (level 2)
#Logistic regression fit on the out-of-fold predictions of the level-1 models
#(stack_train: one column per base model) against the training target.
#LogisticRegression is already imported in the notebook's first cell.
clf = LogisticRegression()
clf.fit(stack_train, dfy_train)

In [None]:
#Level-2 prediction for the test rows: second predict_proba column
#(presumably the positive class of a 0/1 target; confirm against clf.classes_)
p_test = clf.predict_proba(stack_test)[:, 1]

In [None]:
#Assemble the submission (test ids + stacked predictions) and write it to stack1.csv
sub = pd.DataFrame({'id': id_test, 'target': p_test})
sub.to_csv('stack1.csv', index=False)