##  E(a).  INSTALLMENT PAYMENT-TIME SERIES FEATURE EXTRACTION
Train GRU network on installment payment time series data. Save prediction to be used as features in final training.

In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
import gc

import os
print(os.listdir('C:/Users/monis/Desktop/Home credit risk'))
    
gc.enable()

['.ipynb_checkpoints', 'Automated Hyper parameter tuning.ipynb', 'baseline_lgb.csv', 'baseline_lgb_domain_features.csv', 'bayes_test.csv', 'H20AutoML.ipynb', 'HomeCreditRiskKaggle.ipynb', 'Kaggle competition.docx', 'LightGBM with Simple Features.ipynb', 'log_reg_baseline.csv', 'random_forest_baseline.csv', 'random_forest_baseline_domain.csv', 'sample_submission.csv']


Helper functions.

In [4]:
def downcast_dtypes(df):
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype in ["int64"]]

    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)

    return df


def scale_data(df_):
    df = df_.copy(deep=True)
    for f_ in df_.columns:
        if (df[f_].max()- df[f_].min() <=10):
            df[f_] = df[f_] - df[f_].min()
            continue
        df[f_] = df[f_] - df[f_].median()
        scale = (df[f_].quantile(0.99)-df[f_].quantile(0.01))
        if scale==0:
            scale = df[f_].max() - df[f_].min()
        df[f_] = df[f_]/scale
        if df[f_].max()>10:
            rescale = df[f_]>df[f_].quantile(0.99)
            quantile99 = df[f_].quantile(0.99)
            quantile100 = df[f_].max()
            df[f_].loc[rescale] = quantile99 + (df[f_].loc[rescale] - quantile99) * (10-quantile99)/(quantile100-quantile99)
        if df[f_].min()<-10:
            rescale = df[f_]<df[f_].quantile(0.01)
            quantile1 = df[f_].quantile(0.01)
            quantile0 = df[f_].min()
            df[f_].loc[rescale] = quantile1 + (df[f_].loc[rescale] - quantile1) * (-10-quantile1)/(quantile0-quantile1)
        df[f_] = df[f_] - df[f_].min()
    return df

Read installment data and create features.

In [5]:
inst = pd.read_csv('installments_payments.csv')
inst['DAYS_ENTRY_PAYMENT_weighted'] = inst['DAYS_ENTRY_PAYMENT'] * inst['AMT_PAYMENT']
inst = inst.groupby(['SK_ID_PREV','SK_ID_CURR','NUM_INSTALMENT_NUMBER']).agg({
                                                                       'DAYS_INSTALMENT':'mean',
                                                                       'DAYS_ENTRY_PAYMENT_weighted':'sum',
                                                                       'AMT_INSTALMENT':'mean',
                                                                       'AMT_PAYMENT':'sum'})
inst['DAYS_ENTRY_PAYMENT'] = inst['DAYS_ENTRY_PAYMENT_weighted']/inst['AMT_PAYMENT']
inst = inst.reset_index()
del inst['DAYS_ENTRY_PAYMENT_weighted']
inst['AMT_PAYMENT_PERC'] = inst['AMT_PAYMENT'] / (1+inst['AMT_INSTALMENT'])
inst['DPD'] = (inst['DAYS_ENTRY_PAYMENT'] - inst['DAYS_INSTALMENT']).clip(lower=0)
inst['DBD'] = (inst['DAYS_INSTALMENT'] - inst['DAYS_ENTRY_PAYMENT']).clip(lower=0)
inst['MONTHS_BALANCE'] = (inst['DAYS_INSTALMENT']/30.4375).astype('int')
del inst['DAYS_ENTRY_PAYMENT'], inst['DAYS_INSTALMENT']
gc.collect()
#apply logarithm to make distribution more normal
inst['AMT_INSTALMENT_LOG'] = inst['AMT_INSTALMENT'].apply(np.log1p)
inst['AMT_PAYMENT_LOG'] = inst['AMT_PAYMENT'].apply(np.log1p)
inst[['AMT_INSTALMENT','AMT_PAYMENT']] = scale_data(inst[['AMT_INSTALMENT','AMT_PAYMENT']])
inst.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_NUMBER,AMT_INSTALMENT,AMT_PAYMENT,AMT_PAYMENT_PERC,DPD,DBD,MONTHS_BALANCE,AMT_INSTALMENT_LOG,AMT_PAYMENT_LOG
0,1000001,158271,1,0.044024,0.035582,0.999844,0.0,26.0,-8,8.764883,8.764883
1,1000001,158271,2,0.426463,0.344683,0.999984,0.0,6.0,-7,11.035536,11.035536
2,1000002,101962,1,0.043059,0.034802,0.99984,0.0,11.0,-52,8.742734,8.742734
3,1000002,101962,2,0.043059,0.034802,0.99984,0.0,5.0,-51,8.742734,8.742734
4,1000002,101962,3,0.043059,0.034802,0.99984,0.0,19.0,-50,8.742734,8.742734


Read target from main table.

In [6]:
data_app = pd.read_csv('application_train.csv',usecols=['SK_ID_CURR','TARGET'])
data_test = pd.read_csv('application_test.csv',usecols=['SK_ID_CURR'])
data_app.shape, data_test.shape

((307511, 2), (48744, 1))

In [7]:
trn_id = data_app['SK_ID_CURR'].loc[data_app.SK_ID_CURR.isin(inst.SK_ID_CURR)]
test_id = data_test['SK_ID_CURR'].loc[data_test['SK_ID_CURR'].isin(inst.SK_ID_CURR)]
trn_id.shape, test_id.shape

((291643,), (47944,))

Split train and test set. Group by ID and month to create time series.

In [8]:
inst_trn = inst.loc[inst.SK_ID_CURR.isin(trn_id)]
inst_test = inst.loc[inst.SK_ID_CURR.isin(test_id)]
num_aggregations = {
    'SK_ID_PREV': ['count'],
    'NUM_INSTALMENT_NUMBER': ['sum', 'max'],
    'AMT_INSTALMENT': ['sum','mean'],
    'AMT_PAYMENT': ['sum','mean'],
    'AMT_PAYMENT_PERC': ['mean','max'],
    'DPD': ['sum','max','mean'],
    'DBD': ['sum','max','mean'],
    'AMT_INSTALMENT_LOG': ['mean'],
    'AMT_PAYMENT_LOG': ['mean']
}
inst_trn = inst_trn.groupby(['SK_ID_CURR','MONTHS_BALANCE']).agg(num_aggregations)
inst_test = inst_test.groupby(['SK_ID_CURR','MONTHS_BALANCE']).agg(num_aggregations)
inst_trn.columns = pd.Index([e[0] + "_" + e[1].upper() for e in inst_trn.columns.tolist()])
inst_test.columns = pd.Index([e[0] + "_" + e[1].upper() for e in inst_test.columns.tolist()])

inst_trn = downcast_dtypes(inst_trn)
inst_test = downcast_dtypes(inst_test)
del inst
gc.collect()
inst_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SK_ID_PREV_COUNT,NUM_INSTALMENT_NUMBER_SUM,NUM_INSTALMENT_NUMBER_MAX,AMT_INSTALMENT_SUM,AMT_INSTALMENT_MEAN,AMT_PAYMENT_SUM,AMT_PAYMENT_MEAN,AMT_PAYMENT_PERC_MEAN,AMT_PAYMENT_PERC_MAX,DPD_SUM,DPD_MAX,DPD_MEAN,DBD_SUM,DBD_MAX,DBD_MEAN,AMT_INSTALMENT_LOG_MEAN,AMT_PAYMENT_LOG_MEAN
SK_ID_CURR,MONTHS_BALANCE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
100001,-95,1,2,2,0.027373,0.027373,0.022124,0.022124,0.999749,0.999749,0.0,0.0,0.0,0.0,0.0,0.0,8.289804,8.289804
100001,-94,1,3,3,0.027373,0.027373,0.022124,0.022124,0.999749,0.999749,11.0,11.0,11.0,0.0,0.0,0.0,8.289804,8.289804
100001,-93,1,4,4,0.027365,0.027365,0.022118,0.022118,0.999749,0.999749,0.0,0.0,0.0,0.0,0.0,0.0,8.28952,8.28952
100001,-56,1,1,1,0.02716,0.02716,0.021951,0.021951,0.999747,0.999747,0.0,0.0,0.0,6.0,6.0,6.0,8.281977,8.281977
100001,-55,1,2,2,0.02716,0.02716,0.021951,0.021951,0.999747,0.999747,0.0,0.0,0.0,36.0,36.0,36.0,8.281977,8.281977


Convert dataframe to 3D array (n_sample * n_time_step * n_features) for GRU network training.

In [9]:
train_x = inst_trn.to_panel().to_xarray().values
train_x = train_x.swapaxes(0,1).swapaxes(1,2)
test_x = inst_test.to_panel().to_xarray().values
test_x = test_x.swapaxes(0,1).swapaxes(1,2)
train_x[np.isnan(train_x)]=-9
test_x[np.isnan(test_x)]=-9
train_y = data_app['TARGET'].loc[data_app.SK_ID_CURR.isin(trn_id)]

# del inst_trn, inst_test
# gc.collect()

# train_x.shape, test_x.shape, train_y.shape

Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  """Entry point for launching an IPython kernel.
Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
del inst_trn, inst_test
gc.collect()

7

In [11]:
train_x.shape, test_x.shape, train_y.shape

((291643, 97, 17), (47944, 97, 17), (291643,))

### Define GRU model. Use callback to evaluate auc metric.

In [12]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, GRU
from keras.regularizers import l2
from keras.optimizers import RMSprop, Adam



Using TensorFlow backend.


In [13]:
def build_model(time_step, n_features):
    model = Sequential()
    model.add(GRU(8, input_shape=(time_step, n_features))) #unit: #of neurons in each LSTM cell? input_shape=(time_step, n_features)
    model.add(Dense(1,activation='sigmoid'))
    return model

from keras.callbacks import Callback
from keras.callbacks import EarlyStopping
import logging

class IntervalEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == (self.interval-1):
            y_pred = self.model.predict(self.X_val, verbose=0)[:,0]
            score = roc_auc_score(self.y_val, y_pred)
            print('roc score',score)
            

Training...

In [14]:
# Run a 5 fold
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
oof_preds = np.zeros(train_x.shape[0])
sub_preds = np.zeros(test_x.shape[0])


    

    
#     oof_preds[val_idx] = model.predict(val_x)[:,0]
#     sub_preds += model.predict(test_x)[:,0] / folds.n_splits
    
#     print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
  
#     del model, trn_x, trn_y, val_x, val_y
#     gc.collect()

In [15]:
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train_x, train_y)):
    trn_x, val_x = train_x[trn_idx], train_x[val_idx]
    trn_y, val_y = train_y.values[trn_idx], train_y.values[val_idx]
    ival = IntervalEvaluation(validation_data=(val_x, val_y), interval=5)

In [16]:
    model = build_model(trn_x.shape[1],trn_x.shape[2])
    model.compile(loss='binary_crossentropy', optimizer=Adam(decay=0.0001))
    model.fit(trn_x, trn_y,
              validation_data = [val_x, val_y],
              epochs=40, batch_size=8000, 
              class_weight = {0:1,1:10},
              callbacks=[ival], verbose=5)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 233315 samples, validate on 58328 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
roc score 0.5426625910447574
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
roc score 0.562305968261808
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
roc score 0.5718870291729394
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
roc score 0.5924490872329541
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
roc score 0.603628750419289
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
roc score 0.6104027707346336
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
roc score 0.612428975043855
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
roc score 0.6155440805668564


<keras.callbacks.History at 0x28824a52da0>

Save model prediction to disk.

In [17]:
inst_score_train = pd.DataFrame({'inst_score':oof_preds}, index=trn_id)
inst_score_test = pd.DataFrame({'inst_score':sub_preds}, index=test_id)             
inst_score_train.to_csv('inst_score_train.csv')
inst_score_test.to_csv('inst_score_test.csv')