## E(d). Credit card balance time series feature extraction

Train GRU network on credit card balance time series data. Save prediction to be used as features in final training.

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
import gc

import os
print(os.listdir("../input"))
    
gc.enable()

['download_command.txt']


Scale data for NN training.

In [2]:
def scale_data(df_):
    df = df_.copy(deep=True)
    for f_ in df_.columns:
        if (df[f_].max()- df[f_].min() <=10):
            df[f_] = df[f_] - df[f_].min()
            continue
        df[f_] = df[f_] - df[f_].median()
        scale = (df[f_].quantile(0.99)-df[f_].quantile(0.01))
        if scale==0:
            scale = df[f_].max() - df[f_].min()
        df[f_] = df[f_]/scale
        if df[f_].max()>10:
            rescale = df[f_]>df[f_].quantile(0.99)
            quantile99 = df[f_].quantile(0.99)
            quantile100 = df[f_].max()
            df[f_].loc[rescale] = quantile99 + (df[f_].loc[rescale] - quantile99) * (10-quantile99)/(quantile100-quantile99)
        if df[f_].min()<-10:
            rescale = df[f_]<df[f_].quantile(0.01)
            quantile1 = df[f_].quantile(0.01)
            quantile0 = df[f_].min()
            df[f_].loc[rescale] = quantile1 + (df[f_].loc[rescale] - quantile1) * (-10-quantile1)/(quantile0-quantile1)
        df[f_] = df[f_] - df[f_].min()
    return df

Read credit card balance data and create features.

In [3]:
ccbl = pd.read_csv('credit_card_balance.csv')
    
ccbl = pd.concat([ccbl, pd.get_dummies(ccbl['NAME_CONTRACT_STATUS'], prefix='NAME_CONTRACT_STATUS')], axis=1)
del ccbl['NAME_CONTRACT_STATUS']

sum_feats = [f_ for f_ in ccbl.columns.values if ((f_.find('SK_ID_CURR')<0) & (f_.find('MONTHS_BALANCE')<0) & (f_.find('SK_ID_PREV')<0))]
print('sum_feats',sum_feats)
sum_ccbl_mon = ccbl.groupby(['SK_ID_CURR','MONTHS_BALANCE'])[sum_feats].sum()
sum_ccbl_mon['CNR_ACCOUNT_W_MONTH'] = ccbl.groupby(['SK_ID_CURR','MONTHS_BALANCE'])['SK_ID_PREV'].count()
ccbl = sum_ccbl_mon.reset_index()

#compute ratio after summing up account
ccbl['AMT_BALANCE_CREDIT_RATIO'] = (ccbl['AMT_BALANCE']/(ccbl['AMT_CREDIT_LIMIT_ACTUAL']+0.001)).clip(-100,100)
ccbl['AMT_CREDIT_USE_RATIO'] = (ccbl['AMT_DRAWINGS_CURRENT']/(ccbl['AMT_CREDIT_LIMIT_ACTUAL']+0.001)).clip(-100,100)
ccbl['AMT_DRAWING_ATM_RATIO'] = ccbl['AMT_DRAWINGS_ATM_CURRENT']/(ccbl['AMT_DRAWINGS_CURRENT']+0.001)
ccbl['AMT_DRAWINGS_OTHER_RATIO'] = ccbl['AMT_DRAWINGS_OTHER_CURRENT']/(ccbl['AMT_DRAWINGS_CURRENT']+0.001)
ccbl['AMT_DRAWINGS_POS_RATIO'] = ccbl['AMT_DRAWINGS_POS_CURRENT']/(ccbl['AMT_DRAWINGS_CURRENT']+0.001)
ccbl['AMT_PAY_USE_RATIO'] = ((ccbl['AMT_PAYMENT_TOTAL_CURRENT']+0.001)/(ccbl['AMT_DRAWINGS_CURRENT']+0.001)).clip(-100,100)
ccbl['AMT_BALANCE_RECIVABLE_RATIO'] = ccbl['AMT_BALANCE']/(ccbl['AMT_TOTAL_RECEIVABLE']+0.001)
ccbl['AMT_DRAWING_BALANCE_RATIO'] = ccbl['AMT_DRAWINGS_CURRENT']/(ccbl['AMT_BALANCE']+0.001)
ccbl['AMT_RECEIVABLE_PRINCIPAL_DIFF'] = ccbl['AMT_TOTAL_RECEIVABLE']-ccbl['AMT_RECEIVABLE_PRINCIPAL']
ccbl['AMT_PAY_INST_DIFF'] = ccbl['AMT_PAYMENT_CURRENT'] - ccbl['AMT_INST_MIN_REGULARITY']

rejected_features = ['AMT_RECIVABLE','AMT_RECEIVABLE_PRINCIPAL','AMT_DRAWINGS_ATM_CURRENT',
                     'AMT_DRAWINGS_OTHER_CURRENT','AMT_DRAWINGS_POS_CURRENT']
for f_ in rejected_features:
    del ccbl[f_]
    
ccbl.iloc[:,3:] = scale_data(ccbl.iloc[:,3:])

del sum_ccbl_mon
gc.collect()
ccbl.head()

sum_feats ['AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM', 'SK_DPD', 'SK_DPD_DEF', 'NAME_CONTRACT_STATUS_Active', 'NAME_CONTRACT_STATUS_Approved', 'NAME_CONTRACT_STATUS_Completed', 'NAME_CONTRACT_STATUS_Demand', 'NAME_CONTRACT_STATUS_Refused', 'NAME_CONTRACT_STATUS_Sent proposal', 'NAME_CONTRACT_STATUS_Signed']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,...,AMT_BALANCE_CREDIT_RATIO,AMT_CREDIT_USE_RATIO,AMT_DRAWING_ATM_RATIO,AMT_DRAWINGS_OTHER_RATIO,AMT_DRAWINGS_POS_RATIO,AMT_PAY_USE_RATIO,AMT_BALANCE_RECIVABLE_RATIO,AMT_DRAWING_BALANCE_RATIO,AMT_RECEIVABLE_PRINCIPAL_DIFF,AMT_PAY_INST_DIFF
0,100006,-6,0.0,0.352941,0.039439,0.0,0.0,0.0,0.904595,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076
1,100006,-5,0.0,0.352941,0.039439,0.0,0.0,0.0,0.904595,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076
2,100006,-4,0.0,0.352941,0.039439,0.0,0.0,0.0,0.904595,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076
3,100006,-3,0.0,0.352941,0.039439,0.0,0.0,0.0,0.904595,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076
4,100006,-2,0.0,0.352941,0.039439,0.0,0.0,0.0,0.904595,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076


Read target from main table.

In [4]:
data_app = pd.read_csv('application_train.csv',usecols=['SK_ID_CURR','TARGET'])
data_test = pd.read_csv('application_test.csv',usecols=['SK_ID_CURR'])
data_app.shape, data_test.shape

((307511, 2), (48744, 1))

In [5]:
trn_id = data_app['SK_ID_CURR'].loc[data_app.SK_ID_CURR.isin(ccbl.SK_ID_CURR)]
test_id = data_test['SK_ID_CURR'].loc[data_test['SK_ID_CURR'].isin(ccbl.SK_ID_CURR)]
trn_id.shape, test_id.shape

((86905,), (16653,))

Split train and test set. Group by ID and month to create time series.

In [6]:
ccbl_trn = ccbl.loc[ccbl.SK_ID_CURR.isin(trn_id)]
ccbl_test = ccbl.loc[ccbl.SK_ID_CURR.isin(test_id)]
feats = ccbl.columns.values[2:]
ccbl_trn = ccbl_trn.groupby(['SK_ID_CURR','MONTHS_BALANCE'])[feats].sum() 
ccbl_test = ccbl_test.groupby(['SK_ID_CURR','MONTHS_BALANCE'])[feats].sum() 
#tmp = tmp.reset_index() 
#table = pd.pivot_table(tmp, index='SK_ID_CURR', columns='MONTHS_BALANCE', values=feats, fill_value=0) 
#table.head(10)
ccbl_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,...,AMT_BALANCE_CREDIT_RATIO,AMT_CREDIT_USE_RATIO,AMT_DRAWING_ATM_RATIO,AMT_DRAWINGS_OTHER_RATIO,AMT_DRAWINGS_POS_RATIO,AMT_PAY_USE_RATIO,AMT_BALANCE_RECIVABLE_RATIO,AMT_DRAWING_BALANCE_RATIO,AMT_RECEIVABLE_PRINCIPAL_DIFF,AMT_PAY_INST_DIFF
SK_ID_CURR,MONTHS_BALANCE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
100013,-96,0.0,0.205882,0.039439,0.0,0.0,0.0,0.904595,0.0,0.0,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076
100013,-95,0.0,0.205882,0.039439,0.0,0.0,0.0,0.904595,0.0,0.0,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076
100013,-94,0.0,0.205882,0.039439,0.0,0.0,0.0,0.904595,0.0,0.0,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076
100013,-93,0.0,0.205882,0.039439,0.0,0.0,0.0,0.904595,0.0,0.0,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076
100013,-92,0.0,0.205882,0.039439,0.0,0.0,0.0,0.904595,0.0,0.0,0.0,...,10.0,0.025281,10.0,0.0,0.0,1.01,0.217997,10.0,10.0,1.550076


Convert dataframe to 3D array (n_sample * n_time_step * n_features) for GRU network training.

In [7]:
train_x = ccbl_trn.to_panel().to_xarray().values
train_x = train_x.swapaxes(0,1).swapaxes(1,2)
test_x = ccbl_test.to_panel().to_xarray().values
test_x = test_x.swapaxes(0,1).swapaxes(1,2)
train_x[np.isnan(train_x)]=-9
test_x[np.isnan(test_x)]=-9
train_y = data_app['TARGET'].loc[data_app.SK_ID_CURR.isin(trn_id)]
train_x.shape, test_x.shape, train_y.shape

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  """Entry point for launching an IPython kernel.
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  This is separate from the ipykernel package so we can avoid doing imports until


((86905, 96, 32), (16653, 96, 32), (86905,))

Define GRU model. Use callback to evaluate auc metric.

In [9]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, GRU
from keras.regularizers import l2
from keras.optimizers import RMSprop, Adam

def build_model(time_step, n_features):
    model = Sequential()
    model.add(GRU(16, input_shape=(time_step, n_features))) #unit: #of neurons in each LSTM cell? input_shape=(time_step, n_features)
    model.add(Dense(1,activation='sigmoid'))
    return model

from keras.callbacks import Callback
from keras.callbacks import EarlyStopping
import logging

class IntervalEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == (self.interval-1):
            y_pred = self.model.predict(self.X_val, verbose=0)[:,0]
            score = roc_auc_score(self.y_val, y_pred)
            print('roc score',score)

Using TensorFlow backend.


Training...

In [10]:
# Run a 5 fold
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
oof_preds = np.zeros(train_x.shape[0])
sub_preds = np.zeros(test_x.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    trn_x, val_x = train_x[trn_idx], train_x[val_idx]
    trn_y, val_y = train_y.values[trn_idx], train_y.values[val_idx]
    ival = IntervalEvaluation(validation_data=(val_x, val_y), interval=5)
    
    model = build_model(trn_x.shape[1],trn_x.shape[2])
    model.compile(loss='binary_crossentropy', optimizer=Adam(decay=0.001))
    model.fit(trn_x, trn_y,
              validation_data= [val_x, val_y],
              epochs=20, batch_size=3000, 
              class_weight = {0:1,1:10},
              callbacks=[ival], verbose=5)
    
    oof_preds[val_idx] = model.predict(val_x)[:,0]
    sub_preds += model.predict(test_x)[:,0] / folds.n_splits
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
  
    del model, trn_x, trn_y, val_x, val_y
    gc.collect()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 69523 samples, validate on 17382 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
roc score 0.6159685248368506
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
roc score 0.6186592541891123
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
roc score 0.6266124594412428
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
roc score 0.6279787657597877
Fold  1 AUC : 0.627979
Train on 69524 samples, validate on 17381 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
roc score 0.6124387690086639
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
roc score 0.620732829760308
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
roc score 0.6258351580742141
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
roc score 0.6268245771549158
Fold  2 AUC : 0.626825
Train on 69524 samples, validate on 17381

Save model prediction to disk.

In [11]:
cc_score_train = pd.DataFrame({'cc_score':oof_preds}, index=trn_id)
cc_score_test = pd.DataFrame({'cc_score':sub_preds}, index=test_id)             
cc_score_train.to_csv('cc_score_train.csv')
cc_score_test.to_csv('cc_score_test.csv')