## E(b). BUREAU BALANCE TIME SERIES FEATURE EXTRACTION


Train GRU network on bureau balance time series data. Save prediction to be used as features in final training.

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
import gc

import os
print(os.listdir('C:/Users/monis/Desktop/Home credit risk'))
    
gc.enable()

['.ipynb_checkpoints', 'Automated Hyper parameter tuning.ipynb', 'baseline_lgb.csv', 'baseline_lgb_domain_features.csv', 'bayes_test.csv', 'H20AutoML.ipynb', 'HomeCreditRiskKaggle.ipynb', 'Kaggle competition.docx', 'LightGBM with Simple Features.ipynb', 'log_reg_baseline.csv', 'random_forest_baseline.csv', 'random_forest_baseline_domain.csv', 'sample_submission.csv']


Read bureau balance data and create features.

In [2]:
buro = pd.read_csv('bureau.csv')
buro_id_map = buro.groupby('SK_ID_BUREAU')['SK_ID_CURR'].min()
buro.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [3]:
#buro has 1716428 SK_ID_BUREAU
#bubl has 817395 SK_ID_BUREAU
#942074 buro_id in buro not present in bubl
#43041 buro_id in bubl not present in buro
#interesting...
bubl = pd.read_csv('bureau_balance.csv')
bubl['STATUS_COMPLETE'] = 0
bubl['STATUS_COMPLETE'].loc[bubl['STATUS']=='C'] = 1
bubl['STATUS_X'] = 0
bubl['STATUS_X'].loc[bubl['STATUS']=='X'] = 1
bubl['STATUS_DPD'] = -1
bubl['STATUS_DPD'].loc[bubl['STATUS'].isin(['0','1','2','3','4','5'])] = bubl['STATUS']
bubl['STATUS_DPD'] = bubl['STATUS_DPD'].astype('int32')
bubl['SK_ID_CURR'] = bubl['SK_ID_BUREAU'].map(buro_id_map)
bubl = bubl.loc[bubl['SK_ID_CURR'].notna()]
bubl['SK_ID_CURR'] = bubl['SK_ID_CURR'].astype('int')
bubl.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS,STATUS_COMPLETE,STATUS_X,STATUS_DPD,SK_ID_CURR
0,5715448,0,C,1,0,-1,380361
1,5715448,-1,C,1,0,-1,380361
2,5715448,-2,C,1,0,-1,380361
3,5715448,-3,C,1,0,-1,380361
4,5715448,-4,C,1,0,-1,380361


Read target from main table.

In [4]:
data_app = pd.read_csv('application_train.csv',usecols=['SK_ID_CURR','TARGET'])
data_test = pd.read_csv('application_test.csv',usecols=['SK_ID_CURR'])
data_app.shape, data_test.shape

((307511, 2), (48744, 1))

In [5]:
trn_id = data_app['SK_ID_CURR'].loc[data_app.SK_ID_CURR.isin(bubl.SK_ID_CURR)]
test_id = data_test['SK_ID_CURR'].loc[data_test['SK_ID_CURR'].isin(bubl.SK_ID_CURR)]
trn_id.shape, test_id.shape

((92231,), (42311,))

Split train and test set. Groupby ID and month to create time series.

In [6]:
bubl_trn = bubl.loc[bubl.SK_ID_CURR.isin(trn_id)]
bubl_test = bubl.loc[bubl.SK_ID_CURR.isin(test_id)]
num_aggregations = {
    'SK_ID_BUREAU' : ['count'],
    'STATUS_COMPLETE': ['sum'],
    'STATUS_X': ['sum'],
    'STATUS_DPD': ['sum','mean','max'],
}
bubl_trn = bubl_trn.groupby(['SK_ID_CURR','MONTHS_BALANCE']).agg(num_aggregations)
bubl_test = bubl_test.groupby(['SK_ID_CURR','MONTHS_BALANCE']).agg(num_aggregations)
bubl_trn.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bubl_trn.columns.tolist()])
bubl_test.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bubl_test.columns.tolist()])
bubl_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SK_ID_BUREAU_COUNT,STATUS_COMPLETE_SUM,STATUS_X_SUM,STATUS_DPD_SUM,STATUS_DPD_MEAN,STATUS_DPD_MAX
SK_ID_CURR,MONTHS_BALANCE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100001,-51,1,0,1,-1,-1.0,-1
100001,-50,1,0,1,-1,-1.0,-1
100001,-49,1,0,1,-1,-1.0,-1
100001,-48,1,0,1,-1,-1.0,-1
100001,-47,1,0,1,-1,-1.0,-1


Convert dataframe to 3D array (n_sample * n_time_step * n_features) for GRU network training.

In [7]:
train_x = bubl_trn.to_panel().to_xarray().values
train_x = train_x.swapaxes(0,1).swapaxes(1,2)
test_x = bubl_test.to_panel().to_xarray().values
test_x = test_x.swapaxes(0,1).swapaxes(1,2)
train_x[np.isnan(train_x)]=-9
test_x[np.isnan(test_x)]=-9
train_y = data_app['TARGET'].loc[data_app.SK_ID_CURR.isin(trn_id)]
train_x.shape, test_x.shape, train_y.shape

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  """Entry point for launching an IPython kernel.
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  This is separate from the ipykernel package so we can avoid doing imports until


((92231, 97, 6), (42311, 97, 6), (92231,))

#### Define GRU model. Use callback to evaluate auc metric.

In [8]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, GRU
from keras.regularizers import l2
from keras.optimizers import RMSprop, Adam

def build_model(time_step, n_features):
    model = Sequential()
    model.add(GRU(4, input_shape=(time_step, n_features))) #unit: #of neurons in each LSTM cell? input_shape=(time_step, n_features)
    model.add(Dense(1,activation='sigmoid'))
    return model

from keras.callbacks import Callback
from keras.callbacks import EarlyStopping
import logging

class IntervalEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == (self.interval-1):
            y_pred = self.model.predict(self.X_val, verbose=0)[:,0]
            score = roc_auc_score(self.y_val, y_pred)
            print('roc score',score)

Using TensorFlow backend.


Training...

In [9]:
# Run a 5 fold
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
oof_preds = np.zeros(train_x.shape[0])
sub_preds = np.zeros(test_x.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    trn_x, val_x = train_x[trn_idx], train_x[val_idx]
    trn_y, val_y = train_y.values[trn_idx], train_y.values[val_idx]
    ival = IntervalEvaluation(validation_data=(val_x, val_y), interval=5)
    
    model = build_model(trn_x.shape[1],trn_x.shape[2])
    model.compile(loss='binary_crossentropy', optimizer=Adam(decay=0.0002))
    model.fit(trn_x, trn_y,
              validation_data = [val_x, val_y],
              epochs=40, batch_size=5000, 
              class_weight = {0:1,1:10},
              callbacks=[ival], verbose=5)
    
    oof_preds[val_idx] = model.predict(val_x)[:,0]
    sub_preds += model.predict(test_x)[:,0] / folds.n_splits
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
  
    del model, trn_x, trn_y, val_x, val_y
    gc.collect()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 73784 samples, validate on 18447 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
roc score 0.5181226054844156
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
roc score 0.5304963697464068
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
roc score 0.5361157877821211
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
roc score 0.540829066703233
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
roc score 0.5455003046984861
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
roc score 0.5503918646486499
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
roc score 0.5568373868774946
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
roc score 0.5619063045279649
Fold  1 AUC : 0.561906
Train on 73785 samples, validate on 18446 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40


Save model prediction to disk.

In [11]:
bubl_score_train = pd.DataFrame({'bubl_score':oof_preds}, index=trn_id)
bubl_score_test = pd.DataFrame({'bubl_score':sub_preds}, index=test_id)             
bubl_score_train.to_csv('bubl_score_train.csv')
bubl_score_test.to_csv('bubl_score_test.csv')