previous_application.csv

All previous applications for Home Credit loans of clients who have loans in our sample.

There is one row for each previous application related to loans in our data sample.

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import LabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

installments_payments.csv

Repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.
There is 
a) one row for every payment that was made plus 
b) one row each for missed payment.
One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.

In [2]:
installments = pd.read_csv('./data/rawdata/installments_payments.csv.zip',compression='zip')
installments['DPD'] = installments.DAYS_INSTALMENT - installments.DAYS_ENTRY_PAYMENT
installments['AMT_DPD'] = installments.AMT_INSTALMENT - installments.AMT_PAYMENT.fillna(0)
installments = installments.sort_values(['SK_ID_CURR','SK_ID_PREV','NUM_INSTALMENT_NUMBER'], ascending = [True,True,True])

In [3]:
installments.shape
installments.columns
installments.head(10000)

(13605401, 10)

Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_VERSION',
       'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
       'AMT_INSTALMENT', 'AMT_PAYMENT', 'DPD', 'AMT_DPD'],
      dtype='object')

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,DPD,AMT_DPD
1478621,1369693,100001,1.0,1,-1709.0,-1715.0,3951.000,3951.000,6.0,0.00
2568722,1369693,100001,1.0,2,-1679.0,-1715.0,3951.000,3951.000,36.0,0.00
3458712,1369693,100001,1.0,3,-1649.0,-1660.0,3951.000,3951.000,11.0,0.00
2624024,1369693,100001,2.0,4,-1619.0,-1628.0,17397.900,17397.900,9.0,0.00
1761012,1851984,100001,1.0,2,-2916.0,-2916.0,3982.050,3982.050,0.0,0.00
3774071,1851984,100001,1.0,3,-2886.0,-2875.0,3982.050,3982.050,-11.0,0.00
3435373,1851984,100001,1.0,4,-2856.0,-2856.0,3980.925,3980.925,0.0,0.00
2144879,1038818,100002,1.0,1,-565.0,-587.0,9251.775,9251.775,22.0,0.00
2163032,1038818,100002,1.0,2,-535.0,-562.0,9251.775,9251.775,27.0,0.00
1675768,1038818,100002,1.0,3,-505.0,-529.0,9251.775,9251.775,24.0,0.00


In [4]:

prev_installment_features = pd.concat([
    installments.NUM_INSTALMENT_VERSION.groupby(installments.SK_ID_PREV).nunique(),
    installments.NUM_INSTALMENT_VERSION.astype(str).groupby(installments.SK_ID_PREV).agg(lambda x: ','.join(set(x))),
    installments.NUM_INSTALMENT_VERSION.groupby(installments.SK_ID_PREV).agg(lambda x: list(x)[0]),
    
    installments.NUM_INSTALMENT_NUMBER.groupby(installments.SK_ID_PREV).agg(['min','max','count']),
    installments.NUM_INSTALMENT_NUMBER.groupby(installments.SK_ID_PREV).nunique(),
    
    installments.DAYS_INSTALMENT.groupby(installments.SK_ID_PREV).agg(['min','max','median', 'mean']),
    installments.DAYS_INSTALMENT.groupby(installments.SK_ID_PREV).agg(lambda x: np.max(x) - np.min(x)),
    
    installments.DAYS_ENTRY_PAYMENT.groupby(installments.SK_ID_PREV).agg(['min','max','median', 'mean']),
    installments.DAYS_ENTRY_PAYMENT.groupby(installments.SK_ID_PREV).agg(lambda x: np.max(x) - np.min(x)),
    
    installments.DPD.groupby(installments.SK_ID_PREV).agg(['min','max','median', 'mean','sum','std']),
    
    installments.AMT_INSTALMENT.groupby(installments.SK_ID_PREV).agg(['min','max','mean','sum','median','std']),
    installments.AMT_INSTALMENT.groupby(installments.SK_ID_PREV).nunique(),
    installments.AMT_PAYMENT.groupby(installments.SK_ID_PREV).agg(['min','max','mean','sum','median','std']),
    installments.AMT_PAYMENT.groupby(installments.SK_ID_PREV).nunique(),
    installments.AMT_DPD.groupby(installments.SK_ID_PREV).agg(['min','max','mean','sum','median','std']),
    installments.AMT_DPD.groupby(installments.SK_ID_PREV).nunique(),
    
], axis = 1)

prev_installment_features.columns = ['NUM_INSTALMENT_VERSION_nunique','NUM_INSTALMENT_VERSION_set','NUM_INSTALMENT_VERSION_latest']+ ['NUM_INSTALMENT_NUMBER_'+ n for n in ['min','max','count']] + ['NUM_INSTALMENT_NUMBER_nunique'] + ['DAYS_INSTALMENT_'+ n for n in ['min','max','median', 'mean']] + ['DAYS_INSTALMENT_range'] + ['DAYS_ENTRY_PAYMENT_'+ n for n in ['min','max','median', 'mean']] + ['DAYS_ENTRY_PAYMENT_range'] + ['DPD_'+ n for n in ['min','max','median', 'mean','sum','std']] + ['AMT_INSTALMENT_'+ n for n in ['min','max','mean','sum','median','std']] + ['AMT_INSTALMENT_nunique'] + ['AMT_PAYMENT_'+ n for n in ['min','max','mean','sum','median','std']] + ['AMT_PAYMENT_nunique'] + ['AMT_DPD_'+ n for n in ['min','max','mean','sum','median','std']] + ['AMT_DPD_nunique']

prev_installment_features = prev_installment_features.reset_index()



In [5]:
prev_installment_features.shape
prev_installment_features.head()

(997752, 45)

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION_nunique,NUM_INSTALMENT_VERSION_set,NUM_INSTALMENT_VERSION_latest,NUM_INSTALMENT_NUMBER_min,NUM_INSTALMENT_NUMBER_max,NUM_INSTALMENT_NUMBER_count,NUM_INSTALMENT_NUMBER_nunique,DAYS_INSTALMENT_min,DAYS_INSTALMENT_max,...,AMT_PAYMENT_median,AMT_PAYMENT_std,AMT_PAYMENT_nunique,AMT_DPD_min,AMT_DPD_max,AMT_DPD_mean,AMT_DPD_sum,AMT_DPD_median,AMT_DPD_std,AMT_DPD_nunique
0,1000001,2,"1.0,2.0",1.0,1,2,2,2,-268.0,-238.0,...,34221.7125,39339.747885,2,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1000002,2,"1.0,2.0",1.0,1,4,4,4,-1600.0,-1510.0,...,6264.0,6089.7825,2,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1000003,1,1.0,1.0,1,3,3,3,-94.0,-34.0,...,4951.35,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1000004,2,"1.0,2.0",1.0,1,7,7,7,-862.0,-682.0,...,3391.11,3698.527885,2,0.0,0.0,0.0,0.0,0.0,0.0,1
4,1000005,1,1.0,1.0,1,10,11,10,-1688.0,-1418.0,...,14713.605,4432.07797,4,0.0,14710.815,1337.600455,14713.605,0.0,4435.393568,3


In [6]:
installment_features = pd.concat([
    installments.NUM_INSTALMENT_VERSION.groupby(installments.SK_ID_CURR).nunique(),
    installments.NUM_INSTALMENT_VERSION.astype(str).groupby(installments.SK_ID_CURR).agg(lambda x: ','.join(set(x))),
    installments.NUM_INSTALMENT_VERSION.groupby(installments.SK_ID_CURR).agg(lambda x: list(x)[0]),
    
    installments.NUM_INSTALMENT_NUMBER.groupby(installments.SK_ID_CURR).agg(['min','max','count']),
    installments.NUM_INSTALMENT_NUMBER.groupby(installments.SK_ID_CURR).nunique(),
    
    installments.DAYS_INSTALMENT.groupby(installments.SK_ID_CURR).agg(['min','max','median', 'mean']),
    installments.DAYS_INSTALMENT.groupby(installments.SK_ID_CURR).agg(lambda x: np.max(x) - np.min(x)),
    
    installments.DAYS_ENTRY_PAYMENT.groupby(installments.SK_ID_CURR).agg(['min','max','median', 'mean']),
    installments.DAYS_ENTRY_PAYMENT.groupby(installments.SK_ID_CURR).agg(lambda x: np.max(x) - np.min(x)),
    
    installments.DPD.groupby(installments.SK_ID_CURR).agg(['min','max','median', 'mean','sum','std']),
    
    installments.AMT_INSTALMENT.groupby(installments.SK_ID_CURR).agg(['min','max','mean','sum','median','std']),
    installments.AMT_INSTALMENT.groupby(installments.SK_ID_CURR).nunique(),
    installments.AMT_PAYMENT.groupby(installments.SK_ID_CURR).agg(['min','max','mean','sum','median','std']),
    installments.AMT_PAYMENT.groupby(installments.SK_ID_CURR).nunique(),
    installments.AMT_DPD.groupby(installments.SK_ID_CURR).agg(['min','max','mean','sum','median','std']),
    installments.AMT_DPD.groupby(installments.SK_ID_CURR).nunique(),
    
], axis = 1)

installment_features.columns = ['NUM_INSTALMENT_VERSION_nunique','NUM_INSTALMENT_VERSION_set','NUM_INSTALMENT_VERSION_latest']+ ['NUM_INSTALMENT_NUMBER_'+ n for n in ['min','max','count']] + ['NUM_INSTALMENT_NUMBER_nunique'] + ['DAYS_INSTALMENT_'+ n for n in ['min','max','median', 'mean']] + ['DAYS_INSTALMENT_range'] + ['DAYS_ENTRY_PAYMENT_'+ n for n in ['min','max','median', 'mean']] + ['DAYS_ENTRY_PAYMENT_range'] + ['DPD_'+ n for n in ['min','max','median', 'mean','sum','std']] + ['AMT_INSTALMENT_'+ n for n in ['min','max','mean','sum','median','std']] + ['AMT_INSTALMENT_nunique'] + ['AMT_PAYMENT_'+ n for n in ['min','max','mean','sum','median','std']] + ['AMT_PAYMENT_nunique'] + ['AMT_DPD_'+ n for n in ['min','max','mean','sum','median','std']] + ['AMT_DPD_nunique']

installment_features = installment_features.reset_index()


In [7]:
installment_features.shape
installment_features.head(50)

(339587, 45)

Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION_nunique,NUM_INSTALMENT_VERSION_set,NUM_INSTALMENT_VERSION_latest,NUM_INSTALMENT_NUMBER_min,NUM_INSTALMENT_NUMBER_max,NUM_INSTALMENT_NUMBER_count,NUM_INSTALMENT_NUMBER_nunique,DAYS_INSTALMENT_min,DAYS_INSTALMENT_max,...,AMT_PAYMENT_median,AMT_PAYMENT_std,AMT_PAYMENT_nunique,AMT_DPD_min,AMT_DPD_max,AMT_DPD_mean,AMT_DPD_sum,AMT_DPD_median,AMT_DPD_std,AMT_DPD_nunique
0,100001,2,"1.0,2.0",1.0,1,4,7,4,-2916.0,-1619.0,...,3980.925,5076.676624,4,0.0,0.0,0.0,0.0,0.0,0.0,1
1,100002,2,"1.0,2.0",1.0,1,19,19,19,-565.0,-25.0,...,9251.775,10058.037722,2,0.0,0.0,0.0,0.0,0.0,0.0,1
2,100003,2,"1.0,2.0",1.0,1,12,25,12,-2310.0,-536.0,...,64275.615,110542.5923,6,0.0,0.0,0.0,0.0,0.0,0.0,1
3,100004,2,"1.0,2.0",1.0,1,3,3,3,-784.0,-724.0,...,5357.25,3011.87181,2,0.0,0.0,0.0,0.0,0.0,0.0,1
4,100005,2,"1.0,2.0",1.0,1,9,9,9,-706.0,-466.0,...,4813.2,4281.015,2,0.0,0.0,0.0,0.0,0.0,0.0,1
5,100006,2,"1.0,2.0",2.0,1,10,16,10,-545.0,-11.0,...,29027.52,168097.624347,4,0.0,0.0,0.0,0.0,0.0,0.0,1
6,100007,2,"1.0,2.0",1.0,1,17,66,17,-2326.0,-14.0,...,16037.64,8048.060877,14,0.0,22655.655,452.384318,29857.365,0.0,2843.383508,7
7,100008,2,"1.0,2.0",1.0,1,10,35,10,-2491.0,-69.0,...,17876.115,70740.978283,9,0.0,11758.995,342.461571,11986.155,0.0,1986.874338,3
8,100009,1,1.0,1.0,1,12,51,12,-2908.0,-43.0,...,8996.76,3067.815701,15,0.0,0.0,0.0,0.0,0.0,0.0,1
9,100010,1,1.0,1.0,1,10,10,10,-1039.0,-769.0,...,27463.41,44.910667,2,0.0,0.0,0.0,0.0,0.0,0.0,1


In [8]:
prev_installment_features.to_csv('./data/rawdata/prev_installment_features.csv',index=False)
installment_features.to_csv('./data/rawdata/installment_features.csv',index=False)
