# Porto Seguro’s Safe Driver Prediction

I got Felipe Antunes code as a startpack: https://github.com/felipeeeantunes/udacity_live

## Initializing

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import gc
from time import time
from multiprocessing import *

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')
pd.options.display.float_format = '{:.2f}'.format
rc={'savefig.dpi': 75, 'figure.autolayout': False, 'figure.figsize': [12, 8], 'axes.labelsize': 18,\
   'axes.titlesize': 18, 'font.size': 18, 'lines.linewidth': 2.0, 'lines.markersize': 8, 'legend.fontsize': 16,\
   'xtick.labelsize': 16, 'ytick.labelsize': 16}

sns.set(style='dark',rc=rc)

In [3]:
default_color = '#56B4E9'
colormap = plt.cm.cool

In [4]:
# Setting working directory
path = '../data/raw/'

## Loading Files

In [5]:
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

In [6]:
y = train['target']
del train['target']

In [7]:
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [8]:
id_train = train['id'].values
id_test = test['id'].values

In [9]:
columns_original = list(train.columns)
columns_original

['id',
 'ps_ind_01',
 'ps_ind_02_cat',
 'ps_ind_03',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_14',
 'ps_ind_15',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15',
 'ps_calc_01',
 'ps_calc_02',
 'ps_calc_03',
 'ps_calc_04',
 'ps_calc_05',
 'ps_calc_06',
 'ps_calc_07',
 'ps_calc_08',
 'ps_calc_09',
 'ps_calc_10',
 'ps_calc_11',
 'ps_calc_12',
 'ps_calc_13',
 'ps_calc_14',
 'ps_calc_15_bin',
 'ps_calc_16_bin',
 'ps_calc_17_bin',
 'ps_calc_18_bin',
 'ps_calc_19_bin',
 'ps_calc_20_bin']

In [10]:
train.head(5)

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
1,9,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
2,13,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
3,16,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [22]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score



In [62]:
def cross_val_model(X, y, model, n_splits=5):
   
    X = np.array(X)
    y = np.array(y)

    folds = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15).split(X, y))

    cross_score_mean = 0.0

    t0 = time()
    
    for j, (train_idx, test_idx) in enumerate(folds):
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_holdout = X[test_idx]
        y_holdout = y[test_idx]

        print ("Fit %s fold %d" % (str(model).split('(')[0], j+1))
        model.fit(X_train, y_train)
        cross_score = cross_val_score(model, X_holdout, y_holdout, cv=3, scoring='roc_auc')
        print("    cross_score: %.5f (%.5f)" % (cross_score.mean(), cross_score.mean()*2-1)) 
        print("    [%10d secs elapsed]: cross_score: %.5f (%.5f)" % (time()-t0, cross_score.mean(), cross_score.mean()*2-1)) 
        cross_score_mean += cross_score.mean()
        
    cross_score_mean /= n_splits
    print("cross_score_mean: %.5f (%.5f)" % (cross_score_mean, cross_score_mean*2-1))

## Feature Engineering & Selection

In [24]:
# Selected features from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code
selected_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
    "ps_car_11_cat" # Very nice spot from Tilii : https://www.kaggle.com/tilii7
]

### Adding Combs

In [25]:
train.head(5)

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
1,9,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
2,13,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
3,16,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
# add combinations from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]
start = time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f' % (name1, n_c + 1, (time() - start) / 60), end='')
    print('\r' * 75, end='')
    train[name1] = train[f1].apply(lambda x: str(x)) + "_" + train[f2].apply(lambda x: str(x))
    test[name1] = test[f1].apply(lambda x: str(x)) + "_" + test[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(train[name1].values) + list(test[name1].values))
    train[name1] = lbl.transform(list(train[name1].values))
    test[name1] = lbl.transform(list(test[name1].values))

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.0

In [28]:
len(selected_features)

35

In [29]:
new_features = [f1 + '_plus_' + f2 for (f1, f2) in combs]
selected_features.extend(new_features)
new_features

['ps_reg_01_plus_ps_car_02_cat', 'ps_reg_01_plus_ps_car_04_cat']

In [30]:
len(selected_features)

37

### Categorical Target Encoding for categorical variables

In [31]:
# from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

In [32]:
# adapted from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code
def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [33]:
f_cats = [x for x in selected_features if  "_cat" in x]

In [34]:
for f in f_cats:
    train[f + "_avg"], test[f + "_avg"] = target_encode(trn_series=train[f],
                                         tst_series=test[f],
                                         target=y,
                                         min_samples_leaf=200,
                                         smoothing=10,
                                         noise_level=0)

In [35]:
# verify transformation
train.head(3)

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_car_04_cat_avg,ps_car_09_cat_avg,ps_car_02_cat_avg,ps_ind_02_cat_avg,ps_car_05_cat_avg,ps_car_08_cat_avg,ps_ind_04_cat_avg,ps_car_11_cat_avg,ps_reg_01_plus_ps_car_02_cat_avg,ps_reg_01_plus_ps_car_04_cat_avg
0,7,2,2,5,1,0,0,1,0,0,...,0.03,0.03,0.03,0.04,0.04,0.04,0.04,0.04,0.04,0.04
1,9,1,1,7,0,0,0,0,1,0,...,0.03,0.04,0.03,0.04,0.03,0.03,0.03,0.02,0.04,0.04
2,13,5,4,9,1,0,0,0,1,0,...,0.03,0.04,0.03,0.04,0.03,0.03,0.04,0.03,0.04,0.04


In [36]:
new_te_columns = [c for c in train.columns if '_avg' in c]
new_te_columns

['ps_ind_05_cat_avg',
 'ps_car_01_cat_avg',
 'ps_car_07_cat_avg',
 'ps_car_03_cat_avg',
 'ps_car_06_cat_avg',
 'ps_car_04_cat_avg',
 'ps_car_09_cat_avg',
 'ps_car_02_cat_avg',
 'ps_ind_02_cat_avg',
 'ps_car_05_cat_avg',
 'ps_car_08_cat_avg',
 'ps_ind_04_cat_avg',
 'ps_car_11_cat_avg',
 'ps_reg_01_plus_ps_car_02_cat_avg',
 'ps_reg_01_plus_ps_car_04_cat_avg']

In [37]:
selected_features.extend(new_te_columns)
### VERIFY: for x in new_te_columns: selected_features_te.remove(x[:-4])
selected_features

['ps_car_13',
 'ps_reg_03',
 'ps_ind_05_cat',
 'ps_ind_03',
 'ps_ind_15',
 'ps_reg_02',
 'ps_car_14',
 'ps_car_12',
 'ps_car_01_cat',
 'ps_car_07_cat',
 'ps_ind_17_bin',
 'ps_car_03_cat',
 'ps_reg_01',
 'ps_car_15',
 'ps_ind_01',
 'ps_ind_16_bin',
 'ps_ind_07_bin',
 'ps_car_06_cat',
 'ps_car_04_cat',
 'ps_ind_06_bin',
 'ps_car_09_cat',
 'ps_car_02_cat',
 'ps_ind_02_cat',
 'ps_car_11',
 'ps_car_05_cat',
 'ps_calc_09',
 'ps_calc_05',
 'ps_ind_08_bin',
 'ps_car_08_cat',
 'ps_ind_09_bin',
 'ps_ind_04_cat',
 'ps_ind_18_bin',
 'ps_ind_12_bin',
 'ps_ind_14',
 'ps_car_11_cat',
 'ps_reg_01_plus_ps_car_02_cat',
 'ps_reg_01_plus_ps_car_04_cat',
 'ps_ind_05_cat_avg',
 'ps_car_01_cat_avg',
 'ps_car_07_cat_avg',
 'ps_car_03_cat_avg',
 'ps_car_06_cat_avg',
 'ps_car_04_cat_avg',
 'ps_car_09_cat_avg',
 'ps_car_02_cat_avg',
 'ps_ind_02_cat_avg',
 'ps_car_05_cat_avg',
 'ps_car_08_cat_avg',
 'ps_ind_04_cat_avg',
 'ps_car_11_cat_avg',
 'ps_reg_01_plus_ps_car_02_cat_avg',
 'ps_reg_01_plus_ps_car_04_cat_avg'

In [69]:
positive_cases = len(y[y == 1])
negative_cases = len(y) - positive_cases
positive_cases, negative_cases, len(y)

(21694, 573518, 595212)

In [116]:
# parameters from https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283/code
conf_xgb_model = {
    'n_estimators': 200,
    'max_depth': 4,
    'objective': 'binary:logistic',
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 1,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'nthread': -1,
    'min_child_weight': 100,
}

In [117]:
xgb_model = XGBClassifier(**conf_xgb_model)

In [41]:
cross_val_model(train[selected_features], y, xgb_model)

Fit XGBClassifier fold 1
    cross_score: 0.63381 (0.26763)
    [       153 secs elapsed]: cross_score: 0.63381 (0.26763)
Fit XGBClassifier fold 2
    cross_score: 0.63389 (0.26778)
    [       317 secs elapsed]: cross_score: 0.63389 (0.26778)
Fit XGBClassifier fold 3
    cross_score: 0.63737 (0.27475)
    [       483 secs elapsed]: cross_score: 0.63737 (0.27475)
cross_score_mean: 0.63503 (0.27005)


In [43]:
conf_lgb_model = {
    'boosting_type': 'gbdt',
    'n_estimators': 200,
    'max_depth': 4,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'sub_feature': 0.8,
    'num_leaves': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 3,
}

In [44]:
lgb_model = LGBMClassifier(**conf_lgb_model)

In [45]:
cross_val_model(train[selected_features], y, lgb_model)

Fit LGBMClassifier fold 1
    cross_score: 0.63584 (0.27168)
    [        22 secs elapsed]: cross_score: 0.63584 (0.27168)
Fit LGBMClassifier fold 2
    cross_score: 0.63568 (0.27137)
    [        46 secs elapsed]: cross_score: 0.63568 (0.27137)
Fit LGBMClassifier fold 3
    cross_score: 0.63941 (0.27883)
    [        67 secs elapsed]: cross_score: 0.63941 (0.27883)
cross_score_mean: 0.63698 (0.27396)


## Ensembling

In [118]:
increase = False

class Ensemble(object):

    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        self.S_train = None
        self.S_test = None

    def fit_predict(self, X, y, T):
        t0 = time()
        
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        
        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=15).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]

                # Upsample during cross validation to avoid having the same samples
                # in both train and validation sets
                # Validation set is not up-sampled to monitor overfitting
                if increase:
                    # Get positive examples
                    pos = pd.Series(y_train == 1)
                    
                    # Add positive examples
                    X_train = pd.concat([X_train, X_train.loc[pos]], axis=0)
                    y_train = pd.concat([y_train, y_train.loc[pos]], axis=0)
                    # Shuffle data
                    idx = np.arange(len(X_train))
                    np.random.shuffle(idx)
                    X_train = X_train.iloc[idx]
                    y_train = y_train.iloc[idx]
                
                
                print ("Fitting %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
                y_pred = clf.predict_proba(X_holdout)[:,1]
                cross_score = cross_val_score(clf, X_holdout, y_holdout, cv=3, scoring='roc_auc')
                print("    [%10d secs elapsed]: cross_score: %.5f (%.5f)" % (time()-t0, cross_score.mean(), cross_score.mean()*2-1)) 
        

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)
            #print("    [%10d secs elapsed]: Stacker Score: %.5f (%.5f)" % (time()-t0, results.mean(), results.mean()*2-1)) 

        self.S_train = S_train
        self.S_test = S_test
        
        if False:
            #cross validating stacker
            cross_val_model(S_train, y, self.stacker, n_splits=5)
        
            #Training with all training set (including validation)
            self.stacker.fit(S_train, y)
            res = self.stacker.predict_proba(S_test)[:,1]
        else:
            res = S_test.mean(axis=1)
            print(res)
        return res

In [119]:
conf_log = {
    'penalty':'l2',
    'dual':False,
    'tol':0.0001,
    'C':1.0,
    'fit_intercept':True,
    'intercept_scaling':1,
    'class_weight':None,
    'random_state':None,
    'solver':'liblinear',
    'max_iter':100,
    'multi_class':'ovr',
    'verbose':0,
    'warm_start':False,
    'n_jobs':1
}

In [120]:
log_model = LogisticRegression(**conf_log)

In [121]:
stack = Ensemble(n_splits=5,
        stacker = log_model,
        base_models = (xgb_model, lgb_model)) 

In [122]:
y_pred = stack.fit_predict(train[selected_features], y, test) 

Fitting XGBClassifier fold 1
    [       204 secs elapsed]: cross_score: 0.62980 (0.25961)
Fitting XGBClassifier fold 2
    [       412 secs elapsed]: cross_score: 0.62249 (0.24498)
Fitting XGBClassifier fold 3
    [       620 secs elapsed]: cross_score: 0.62333 (0.24666)
Fitting XGBClassifier fold 4
    [       827 secs elapsed]: cross_score: 0.62455 (0.24911)
Fitting XGBClassifier fold 5
    [      1034 secs elapsed]: cross_score: 0.62816 (0.25632)


NameError: name 'results' is not defined

In [None]:
y_pred.S_test

In [64]:
y_pred = stack.fit_predict(train[selected_features], y, test) 

Fitting XGBClassifier fold 1
Fitting XGBClassifier fold 2
Fitting XGBClassifier fold 3
Fitting LGBMClassifier fold 1
Fitting LGBMClassifier fold 2
Fitting LGBMClassifier fold 3
Fit LogisticRegression fold 1
    cross_score: 0.64368 (0.28736)
    [         1 secs elapsed]: cross_score: 0.64368 (0.28736)
Fit LogisticRegression fold 2
    cross_score: 0.63755 (0.27509)
    [         2 secs elapsed]: cross_score: 0.63755 (0.27509)
Fit LogisticRegression fold 3
    cross_score: 0.64188 (0.28375)
    [         3 secs elapsed]: cross_score: 0.64188 (0.28375)
Fit LogisticRegression fold 4
    cross_score: 0.64129 (0.28257)
    [         4 secs elapsed]: cross_score: 0.64129 (0.28257)
Fit LogisticRegression fold 5
    cross_score: 0.64244 (0.28488)
    [         5 secs elapsed]: cross_score: 0.64244 (0.28488)
cross_score_mean: 0.64137 (0.28273)


# Making a submission

In [61]:
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_pred
sub.to_csv('stacked_xgb_lgb_v4.csv', index=False)