POS_CASH_balance.csv

Monthly balance snapshots of previous POS (point of sales) and cash loans that the applicant had with Home Credit.
This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credits * # of months in which we have some history observable for the previous credits) rows.



In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import LabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
cash = pd.read_csv('./data/rawdata/POS_CASH_balance.csv.zip',compression='zip')
cash = cash.sort_values(['SK_ID_CURR','SK_ID_PREV','MONTHS_BALANCE'], ascending = [True,True,False])
cash['SK_DPD_diff'] = cash.SK_DPD - cash.SK_DPD_DEF

In [3]:
cash.shape
cash.head(50)

(10001358, 9)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,SK_DPD_diff
2197888,1369693,100001,-53,4.0,0.0,Completed,0,0,0
4704415,1369693,100001,-54,4.0,1.0,Active,0,0,0
7823681,1369693,100001,-55,4.0,2.0,Active,0,0,0
8789081,1369693,100001,-56,4.0,3.0,Active,0,0,0
7167007,1369693,100001,-57,4.0,4.0,Active,0,0,0
4928574,1851984,100001,-93,4.0,0.0,Completed,0,0,0
8531326,1851984,100001,-94,4.0,0.0,Active,0,0,0
1891462,1851984,100001,-95,4.0,1.0,Active,7,7,0
1261679,1851984,100001,-96,4.0,2.0,Active,0,0,0
6626366,1038818,100002,-1,24.0,6.0,Active,0,0,0


In [4]:
# np.unique(cash.SK_DPD - cash.SK_DPD_DEF)
cash.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,SK_DPD_diff
2197888,1369693,100001,-53,4.0,0.0,Completed,0,0,0
4704415,1369693,100001,-54,4.0,1.0,Active,0,0,0
7823681,1369693,100001,-55,4.0,2.0,Active,0,0,0
8789081,1369693,100001,-56,4.0,3.0,Active,0,0,0
7167007,1369693,100001,-57,4.0,4.0,Active,0,0,0


In [5]:

prev_cash_features = pd.concat([
    cash.SK_ID_CURR.groupby(cash.SK_ID_PREV).agg('max'),
    cash.MONTHS_BALANCE.groupby(cash.SK_ID_PREV).agg(['min','max','count']),
    cash.CNT_INSTALMENT.groupby(cash.SK_ID_PREV).agg(['min','max']),
    cash.CNT_INSTALMENT.astype(str).groupby(cash.SK_ID_PREV).agg(lambda x: ','.join(set(x))),
    cash.CNT_INSTALMENT_FUTURE.groupby(cash.SK_ID_PREV).agg(['min','max']),
    cash.NAME_CONTRACT_STATUS.groupby(cash.SK_ID_PREV).nunique(),
    cash.NAME_CONTRACT_STATUS.groupby(cash.SK_ID_PREV).agg(lambda x: ','.join(set(x))),
    cash.NAME_CONTRACT_STATUS.groupby(cash.SK_ID_PREV).agg(lambda x: list(x)[0]),
    cash.SK_DPD.groupby(cash.SK_ID_PREV).agg('max'),
    cash.SK_DPD.groupby(cash.SK_ID_PREV).agg(lambda x: sum(x>0)),
    cash.SK_DPD_DEF.groupby(cash.SK_ID_PREV).agg('max'),
    cash.SK_DPD_DEF.groupby(cash.SK_ID_PREV).agg(lambda x: sum(x>0)),
    cash.SK_DPD_diff.groupby(cash.SK_ID_PREV).agg('max'),
    cash.SK_DPD_diff.groupby(cash.SK_ID_PREV).agg(lambda x: sum(x>0)),
],axis = 1)

prev_cash_features.columns = ['SK_ID_CURR','MONTHS_BALANCE_min','MONTHS_BALANCE_max','MONTHS_BALANCE_cnt',
                             'CNT_INSTALMENT_min','CNT_INSTALMENT_max','CNT_INSTALMENT_set','CNT_INSTALMENT_FUTURE_min',
                             'CNT_INSTALMENT_FUTURE_max','NAME_CONTRACT_STATUS_cntd','NAME_CONTRACT_STATUS_set',
                             'NAME_CONTRACT_STATUS_latest','SK_DPD_max','SK_DPD_cnt0','SK_DPD_DEF_max','SK_DPD_DEF_cnt0',
                             'SK_DPD_diff_max','SK_DPD_diff_cnt0']

prev_cash_features = prev_cash_features.reset_index()

In [6]:
prev_cash_features.head(50)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE_min,MONTHS_BALANCE_max,MONTHS_BALANCE_cnt,CNT_INSTALMENT_min,CNT_INSTALMENT_max,CNT_INSTALMENT_set,CNT_INSTALMENT_FUTURE_min,CNT_INSTALMENT_FUTURE_max,NAME_CONTRACT_STATUS_cntd,NAME_CONTRACT_STATUS_set,NAME_CONTRACT_STATUS_latest,SK_DPD_max,SK_DPD_cnt0,SK_DPD_DEF_max,SK_DPD_DEF_cnt0,SK_DPD_diff_max,SK_DPD_diff_cnt0
0,1000001,158271,-10,-8,3,2.0,12.0,"12.0,2.0",0.0,12.0,2,"Completed,Active",Completed,0,0,0,0,0,0
1,1000002,101962,-54,-50,5,4.0,6.0,"4.0,6.0",0.0,4.0,2,"Completed,Active",Completed,0,0,0,0,0,0
2,1000003,252457,-4,-1,4,12.0,12.0,12.0,9.0,12.0,1,Active,Active,0,0,0,0,0,0
3,1000004,260094,-29,-22,8,7.0,10.0,"7.0,10.0",0.0,10.0,2,"Completed,Active",Completed,0,0,0,0,0,0
4,1000005,176456,-56,-46,11,10.0,10.0,10.0,0.0,10.0,2,"Completed,Active",Completed,0,0,0,0,0,0
5,1000007,256657,-5,-1,5,6.0,6.0,6.0,2.0,6.0,1,Active,Active,0,0,0,0,0,0
6,1000008,152059,-43,-34,10,9.0,10.0,"9.0,10.0",0.0,10.0,2,"Completed,Active",Completed,0,0,0,0,0,0
7,1000009,343078,-16,-10,7,6.0,6.0,6.0,0.0,6.0,2,"Completed,Active",Completed,0,0,0,0,0,0
8,1000010,377567,-19,-9,11,11.0,18.0,"11.0,18.0",0.0,18.0,2,"Completed,Active",Completed,0,0,0,0,0,0
9,1000011,198678,-15,-3,13,12.0,12.0,12.0,0.0,12.0,2,"Completed,Active",Completed,0,0,0,0,0,0


In [7]:
cash_features = pd.concat([
    cash.MONTHS_BALANCE.groupby(cash.SK_ID_CURR).agg(['min','max','count']),
    cash.CNT_INSTALMENT.groupby(cash.SK_ID_CURR).agg(['min','max']),
    cash.CNT_INSTALMENT.astype(str).groupby(cash.SK_ID_CURR).agg(lambda x: ','.join(set(x))),
    cash.CNT_INSTALMENT_FUTURE.groupby(cash.SK_ID_CURR).agg(['min','max']),
    cash.NAME_CONTRACT_STATUS.groupby(cash.SK_ID_CURR).nunique(),
    cash.NAME_CONTRACT_STATUS.groupby(cash.SK_ID_CURR).agg(lambda x: ','.join(set(x))),
    prev_cash_features.NAME_CONTRACT_STATUS_latest.groupby(cash.SK_ID_CURR).agg(lambda x: ','.join(set(x))),
    cash.SK_DPD.groupby(cash.SK_ID_CURR).agg('max'),
    cash.SK_DPD.groupby(cash.SK_ID_CURR).agg(lambda x: sum(x>0)),
    cash.SK_DPD_DEF.groupby(cash.SK_ID_CURR).agg('max'),
    cash.SK_DPD_DEF.groupby(cash.SK_ID_CURR).agg(lambda x: sum(x>0)),
    cash.SK_DPD_diff.groupby(cash.SK_ID_CURR).agg('max'),
    cash.SK_DPD_diff.groupby(cash.SK_ID_CURR).agg(lambda x: sum(x>0)),
],axis = 1)

cash_features.columns = ['MONTHS_BALANCE_min','MONTHS_BALANCE_max','MONTHS_BALANCE_cnt',
                             'CNT_INSTALMENT_min','CNT_INSTALMENT_max','CNT_INSTALMENT_set','CNT_INSTALMENT_FUTURE_min',
                             'CNT_INSTALMENT_FUTURE_max','NAME_CONTRACT_STATUS_cntd','NAME_CONTRACT_STATUS_set',
                             'NAME_CONTRACT_STATUS_latest','SK_DPD_max','SK_DPD_cnt0','SK_DPD_DEF_max','SK_DPD_DEF_cnt0',
                             'SK_DPD_diff_max','SK_DPD_diff_cnt0']

cash_features = cash_features.reset_index()

In [8]:
cash_features.shape
cash_features.head(50)

(337252, 18)

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_min,MONTHS_BALANCE_max,MONTHS_BALANCE_cnt,CNT_INSTALMENT_min,CNT_INSTALMENT_max,CNT_INSTALMENT_set,CNT_INSTALMENT_FUTURE_min,CNT_INSTALMENT_FUTURE_max,NAME_CONTRACT_STATUS_cntd,NAME_CONTRACT_STATUS_set,NAME_CONTRACT_STATUS_latest,SK_DPD_max,SK_DPD_cnt0,SK_DPD_DEF_max,SK_DPD_DEF_cnt0,SK_DPD_diff_max,SK_DPD_diff_cnt0
0,100001,-96,-53,9,4.0,4.0,4.0,0.0,4.0,2,"Completed,Active",,7,1,7,1,0,0
1,100002,-19,-1,19,24.0,24.0,24.0,6.0,24.0,1,Active,Active,0,0,0,0,0,0
2,100003,-77,-18,28,6.0,12.0,"12.0,7.0,6.0",0.0,12.0,2,"Completed,Active","Completed,Active",0,0,0,0,0,0
3,100004,-27,-24,4,3.0,4.0,"4.0,3.0",0.0,4.0,2,"Completed,Active",,0,0,0,0,0,0
4,100005,-25,-15,11,9.0,12.0,"12.0,9.0,nan",0.0,12.0,3,"Signed,Completed,Active",Completed,0,0,0,0,0,0
5,100006,-20,-1,21,1.0,48.0,"12.0,5.0,1.0,48.0,nan",0.0,48.0,3,"Returned to the store,Completed,Active",,0,0,0,0,0,0
6,100007,-77,-1,66,10.0,24.0,"12.0,24.0,17.0,10.0,18.0",0.0,24.0,3,"Signed,Completed,Active","Completed,Active",0,0,0,0,0,0
7,100008,-84,-2,83,6.0,30.0,"10.0,6.0,8.0,30.0",0.0,30.0,3,"Signed,Completed,Active",Completed,1294,43,0,0,1294,43
8,100009,-96,-1,64,5.0,12.0,"12.0,5.0,10.0,6.0",0.0,12.0,2,"Completed,Active",Completed,0,0,0,0,0,0
9,100010,-35,-25,11,10.0,10.0,10.0,0.0,10.0,2,"Completed,Active",Active,0,0,0,0,0,0


In [9]:
prev_cash_features.to_csv('./data/rawdata/prev_cash_features.csv',index=False)
cash_features.to_csv('./data/rawdata/cash_features.csv',index=False)



credit_card_balance.csv

Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.
This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * # of months where we have some history observable for the previous credit card) rows.


In [10]:
credit = pd.read_csv('./data/rawdata/credit_card_balance.csv.zip',compression='zip')

credit = credit.sort_values(['SK_ID_CURR','SK_ID_PREV','MONTHS_BALANCE'], ascending = [True,True,False])
credit['SK_DPD_diff'] = credit.SK_DPD - credit.SK_DPD_DEF

In [11]:
credit.shape
credit.head(50)

(3840312, 24)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,SK_DPD_diff
584804,1489396,100006,-1,0.0,270000,,0.0,,,0.0,...,0.0,,0,,,0.0,Active,0,0,0
520387,1489396,100006,-2,0.0,270000,,0.0,,,0.0,...,0.0,,0,,,0.0,Active,0,0,0
1347528,1489396,100006,-3,0.0,270000,,0.0,,,0.0,...,0.0,,0,,,0.0,Active,0,0,0
1399895,1489396,100006,-4,0.0,270000,,0.0,,,0.0,...,0.0,,0,,,0.0,Active,0,0,0
655566,1489396,100006,-5,0.0,270000,,0.0,,,0.0,...,0.0,,0,,,0.0,Active,0,0,0
1636141,1489396,100006,-6,0.0,270000,,0.0,,,0.0,...,0.0,,0,,,0.0,Active,0,0,0
2739019,1843384,100011,-2,0.0,90000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,33.0,Active,0,0,0
3496910,1843384,100011,-3,0.0,90000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,33.0,Active,0,0,0
51047,1843384,100011,-4,0.0,90000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,33.0,Active,0,0,0
2674883,1843384,100011,-5,0.0,90000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,33.0,Active,0,0,0


In [12]:

prev_credit_features = pd.concat([
    credit.SK_ID_CURR.groupby(credit.SK_ID_PREV).agg('max'),
    credit.MONTHS_BALANCE.groupby(credit.SK_ID_PREV).agg(['min','max','count']),
    credit.AMT_BALANCE.groupby(credit.SK_ID_PREV).agg(lambda x: np.min(x[x>0])),
    credit.AMT_BALANCE.groupby(credit.SK_ID_PREV).agg(['max','sum']),
    credit.AMT_BALANCE.groupby(credit.SK_ID_PREV).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_BALANCE.groupby(credit.SK_ID_PREV).agg(lambda x: sum(x>0)),
    credit.AMT_CREDIT_LIMIT_ACTUAL.groupby(credit.SK_ID_PREV).agg(lambda x: np.min(x[x>0])),
    credit.AMT_CREDIT_LIMIT_ACTUAL.groupby(credit.SK_ID_PREV).agg('max'),
    credit.AMT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.min(x[x>0])),
    credit.AMT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: sum(x>0)),
    credit.AMT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.AMT_DRAWINGS_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.min(x[x>0])),
    credit.AMT_DRAWINGS_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: sum(x>0)),
    credit.AMT_DRAWINGS_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_DRAWINGS_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.AMT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.min(x[x>0])),
    credit.AMT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: sum(x>0)),
    credit.AMT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.AMT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.min(x[x>0])),
    credit.AMT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: sum(x>0)),
    credit.AMT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.AMT_INST_MIN_REGULARITY.groupby(credit.SK_ID_PREV).agg(lambda x: np.min(x[x>0])),
    credit.AMT_INST_MIN_REGULARITY.groupby(credit.SK_ID_PREV).agg('max'),
    credit.AMT_INST_MIN_REGULARITY.groupby(credit.SK_ID_PREV).nunique(),
    credit.AMT_PAYMENT_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.AMT_PAYMENT_CURRENT.groupby(credit.SK_ID_PREV).nunique(),
    credit.AMT_PAYMENT_TOTAL_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.AMT_PAYMENT_TOTAL_CURRENT.groupby(credit.SK_ID_PREV).nunique(),
    credit.AMT_RECEIVABLE_PRINCIPAL.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.AMT_RECEIVABLE_PRINCIPAL.groupby(credit.SK_ID_PREV).nunique(),
    credit.AMT_RECIVABLE.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.AMT_RECIVABLE.groupby(credit.SK_ID_PREV).nunique(),
    credit.AMT_TOTAL_RECEIVABLE.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.AMT_TOTAL_RECEIVABLE.groupby(credit.SK_ID_PREV).nunique(),
    credit.CNT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.CNT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.mean(x[x>0])),
    credit.CNT_DRAWINGS_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.CNT_DRAWINGS_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.mean(x[x>0])),
    credit.CNT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.CNT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.mean(x[x>0])),
    credit.CNT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_PREV).agg(['sum','max']),
    credit.CNT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_PREV).agg(lambda x: np.mean(x[x>0])),
    credit.CNT_INSTALMENT_MATURE_CUM.groupby(credit.SK_ID_PREV).nunique(),
    credit.CNT_INSTALMENT_MATURE_CUM.astype(str).groupby(credit.SK_ID_PREV).agg(lambda x: ','.join(set(x))),
    credit.CNT_INSTALMENT_MATURE_CUM.astype(str).groupby(credit.SK_ID_PREV).agg(lambda x: list(x)[0]),
    credit.NAME_CONTRACT_STATUS.groupby(credit.SK_ID_PREV).nunique(),
    credit.NAME_CONTRACT_STATUS.astype(str).groupby(credit.SK_ID_PREV).agg(lambda x: ','.join(set(x))),
    credit.SK_DPD.groupby(credit.SK_ID_PREV).agg('max'),
    credit.SK_DPD.groupby(credit.SK_ID_PREV).agg(lambda x: sum(x>0)),
    credit.SK_DPD_DEF.groupby(credit.SK_ID_PREV).agg('max'),
    credit.SK_DPD_DEF.groupby(credit.SK_ID_PREV).agg(lambda x: sum(x>0)),
    credit.SK_DPD_diff.groupby(credit.SK_ID_PREV).agg('max'),
    credit.SK_DPD_diff.groupby(credit.SK_ID_PREV).agg(lambda x: sum(x>0)),
], axis = 1)
 
prev_credit_features.columns = ['SK_ID_CURR','MONTHS_BALANCE_min','MONTHS_BALANCE_max','MONTHS_BALANCE_count','AMT_BALANCE_min','AMT_BALANCE_max', 'AMT_BALANCE_sum','AMT_BALANCE_mean','AMT_BALANCE_size','AMT_CREDIT_LIMIT_ACTUAL_min','AMT_CREDIT_LIMIT_ACTUAL_max','AMT_DRAWINGS_ATM_CURRENT_min','AMT_DRAWINGS_ATM_CURRENT_size','AMT_DRAWINGS_ATM_CURRENT_mean','AMT_DRAWINGS_ATM_CURRENT_sum','AMT_DRAWINGS_ATM_CURRENT_max','AMT_DRAWINGS_CURRENT_min','AMT_DRAWINGS_CURRENT_size','AMT_DRAWINGS_CURRENT_mean','AMT_DRAWINGS_CURRENT_sum','AMT_DRAWINGS_CURRENT_max','AMT_DRAWINGS_OTHER_CURRENT_min','AMT_DRAWINGS_OTHER_CURRENT_size','AMT_DRAWINGS_OTHER_CURRENT_mean','AMT_DRAWINGS_OTHER_CURRENT_sum','AMT_DRAWINGS_OTHER_CURRENT_max','AMT_DRAWINGS_POS_CURRENT_min','AMT_DRAWINGS_POS_CURRENT_size','AMT_DRAWINGS_POS_CURRENT_mean','AMT_DRAWINGS_POS_CURRENT_sum','AMT_DRAWINGS_POS_CURRENT_max','AMT_INST_MIN_REGULARITY_min','AMT_INST_MIN_REGULARITY_max','AMT_INST_MIN_REGULARITY_cntd','AMT_PAYMENT_CURRENT_sum','AMT_PAYMENT_CURRENT_max','AMT_PAYMENT_CURRENT_cntd','AMT_PAYMENT_TOTAL_CURRENT_sum','AMT_PAYMENT_TOTAL_CURRENT_max','AMT_PAYMENT_TOTAL_CURRENT_cntd','AMT_RECEIVABLE_PRINCIPAL_sum','AMT_RECEIVABLE_PRINCIPAL_max','AMT_RECEIVABLE_PRINCIPAL_cntd','AMT_RECIVABLE_sum','AMT_RECIVABLE_max','AMT_RECIVABLE_cntd','AMT_TOTAL_RECEIVABLE_sum','AMT_TOTAL_RECEIVABLE_max','AMT_TOTAL_RECEIVABLE_cntd','CNT_DRAWINGS_ATM_CURRENT_sum','CNT_DRAWINGS_ATM_CURRENT_max','CNT_DRAWINGS_ATM_CURRENT_mean','CNT_DRAWINGS_CURRENT_sum','CNT_DRAWINGS_CURRENT_max','CNT_DRAWINGS_CURRENT_mean','CNT_DRAWINGS_OTHER_CURRENT_sum','CNT_DRAWINGS_OTHER_CURRENT_max','CNT_DRAWINGS_OTHER_CURRENT_mean','CNT_DRAWINGS_POS_CURRENT_sum','CNT_DRAWINGS_POS_CURRENT_max','CNT_DRAWINGS_POS_CURRENT_mean','CNT_INSTALMENT_MATURE_CUM_cntd','CNT_INSTALMENT_MATURE_CUM_set','CNT_INSTALMENT_MATURE_CUM_latest','NAME_CONTRACT_STATUS_cntd','NAME_CONTRACT_STATUS_set','SK_DPD_max','SK_DPD_size','SK_DPD_DEF_max','SK_DPD_DEF_size','SK_DPD_diff_max','SK_DPD_diff_size']
prev_credit_features = prev_credit_features.reset_index()



In [13]:
prev_credit_features.shape
prev_credit_features.columns
prev_credit_features.head(50)

(104307, 73)

Index(['SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE_min', 'MONTHS_BALANCE_max',
       'MONTHS_BALANCE_count', 'AMT_BALANCE_min', 'AMT_BALANCE_max',
       'AMT_BALANCE_sum', 'AMT_BALANCE_mean', 'AMT_BALANCE_size',
       'AMT_CREDIT_LIMIT_ACTUAL_min', 'AMT_CREDIT_LIMIT_ACTUAL_max',
       'AMT_DRAWINGS_ATM_CURRENT_min', 'AMT_DRAWINGS_ATM_CURRENT_size',
       'AMT_DRAWINGS_ATM_CURRENT_mean', 'AMT_DRAWINGS_ATM_CURRENT_sum',
       'AMT_DRAWINGS_ATM_CURRENT_max', 'AMT_DRAWINGS_CURRENT_min',
       'AMT_DRAWINGS_CURRENT_size', 'AMT_DRAWINGS_CURRENT_mean',
       'AMT_DRAWINGS_CURRENT_sum', 'AMT_DRAWINGS_CURRENT_max',
       'AMT_DRAWINGS_OTHER_CURRENT_min', 'AMT_DRAWINGS_OTHER_CURRENT_size',
       'AMT_DRAWINGS_OTHER_CURRENT_mean', 'AMT_DRAWINGS_OTHER_CURRENT_sum',
       'AMT_DRAWINGS_OTHER_CURRENT_max', 'AMT_DRAWINGS_POS_CURRENT_min',
       'AMT_DRAWINGS_POS_CURRENT_size', 'AMT_DRAWINGS_POS_CURRENT_mean',
       'AMT_DRAWINGS_POS_CURRENT_sum', 'AMT_DRAWINGS_POS_CURRENT_max',
       'A

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE_min,MONTHS_BALANCE_max,MONTHS_BALANCE_count,AMT_BALANCE_min,AMT_BALANCE_max,AMT_BALANCE_sum,AMT_BALANCE_mean,AMT_BALANCE_size,...,CNT_INSTALMENT_MATURE_CUM_set,CNT_INSTALMENT_MATURE_CUM_latest,NAME_CONTRACT_STATUS_cntd,NAME_CONTRACT_STATUS_set,SK_DPD_max,SK_DPD_size,SK_DPD_DEF_max,SK_DPD_DEF_size,SK_DPD_diff_max,SK_DPD_diff_size
0,1000018,394447,-6,-2,5,38879.145,136695.42,374731.425,74946.285,5.0,...,"4.0,1.0,3.0,2.0,0.0",4.0,1,Active,0,0,0,0,0,0
1,1000030,361282,-8,-1,8,15583.635,103027.275,447928.515,63989.787857,7.0,...,"4.0,1.0,5.0,3.0,2.0,0.0",5.0,1,Active,0,0,0,0,0,0
2,1000031,131335,-16,-1,16,5805.495,154945.935,838311.03,104788.87875,8.0,...,"4.0,9.0,5.0,1.0,8.0,7.0,6.0,3.0,2.0,0.0,10.0",10.0,1,Active,0,0,0,0,0,0
3,1000035,436351,-6,-2,5,,0.0,0.0,,0.0,...,0.0,0.0,1,Active,0,0,0,0,0,0
4,1000077,181153,-12,-2,11,,0.0,0.0,,0.0,...,0.0,0.0,1,Active,0,0,0,0,0,0
5,1000083,309691,-13,-1,13,,0.0,0.0,,0.0,...,0.0,0.0,1,Active,0,0,0,0,0,0
6,1000087,399664,-32,-1,32,958.725,136005.75,1250499.33,113681.757273,11.0,...,"4.0,9.0,5.0,1.0,8.0,nan,7.0,11.0,6.0,3.0,2.0,10.0",11.0,1,Active,0,0,0,0,0,0
7,1000089,161517,-5,-1,5,,0.0,0.0,,0.0,...,0.0,0.0,1,Completed,0,0,0,0,0,0
8,1000094,359175,-89,-2,88,202.95,72248.805,2583953.145,53832.357187,48.0,...,"9.0,47.0,26.0,27.0,1.0,19.0,3.0,36.0,38.0,45.0...",48.0,2,"Completed,Active",1,1,1,1,0,0
9,1000096,306118,-96,-1,96,67.5,190845.945,3705437.925,105869.655,35.0,...,"9.0,26.0,27.0,19.0,36.0,38.0,12.0,39.0,20.0,42...",42.0,2,"Completed,Active",31,4,0,0,31,4


In [14]:
credit_features = pd.concat([
    credit.MONTHS_BALANCE.groupby(credit.SK_ID_CURR).agg(['min','max','count']),
    credit.AMT_BALANCE.groupby(credit.SK_ID_CURR).agg(lambda x: np.min(x[x>0])),
    credit.AMT_BALANCE.groupby(credit.SK_ID_CURR).agg(['max','sum']),
    credit.AMT_BALANCE.groupby(credit.SK_ID_CURR).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_BALANCE.groupby(credit.SK_ID_CURR).agg(lambda x: sum(x>0)),
    credit.AMT_CREDIT_LIMIT_ACTUAL.groupby(credit.SK_ID_CURR).agg(lambda x: np.min(x[x>0])),
    credit.AMT_CREDIT_LIMIT_ACTUAL.groupby(credit.SK_ID_CURR).agg('max'),
    credit.AMT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.min(x[x>0])),
    credit.AMT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: sum(x>0)),
    credit.AMT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.AMT_DRAWINGS_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.min(x[x>0])),
    credit.AMT_DRAWINGS_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: sum(x>0)),
    credit.AMT_DRAWINGS_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_DRAWINGS_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.AMT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.min(x[x>0])),
    credit.AMT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: sum(x>0)),
    credit.AMT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.AMT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.min(x[x>0])),
    credit.AMT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: sum(x>0)),
    credit.AMT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.mean(x[x>0])),
    credit.AMT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.AMT_INST_MIN_REGULARITY.groupby(credit.SK_ID_CURR).agg(lambda x: np.min(x[x>0])),
    credit.AMT_INST_MIN_REGULARITY.groupby(credit.SK_ID_CURR).agg('max'),
    credit.AMT_INST_MIN_REGULARITY.groupby(credit.SK_ID_CURR).nunique(),
    credit.AMT_PAYMENT_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.AMT_PAYMENT_CURRENT.groupby(credit.SK_ID_CURR).nunique(),
    credit.AMT_PAYMENT_TOTAL_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.AMT_PAYMENT_TOTAL_CURRENT.groupby(credit.SK_ID_CURR).nunique(),
    credit.AMT_RECEIVABLE_PRINCIPAL.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.AMT_RECEIVABLE_PRINCIPAL.groupby(credit.SK_ID_CURR).nunique(),
    credit.AMT_RECIVABLE.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.AMT_RECIVABLE.groupby(credit.SK_ID_CURR).nunique(),
    credit.AMT_TOTAL_RECEIVABLE.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.AMT_TOTAL_RECEIVABLE.groupby(credit.SK_ID_CURR).nunique(),
    credit.CNT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.CNT_DRAWINGS_ATM_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.mean(x[x>0])),
    credit.CNT_DRAWINGS_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.CNT_DRAWINGS_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.mean(x[x>0])),
    credit.CNT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.CNT_DRAWINGS_OTHER_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.mean(x[x>0])),
    credit.CNT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_CURR).agg(['sum','max']),
    credit.CNT_DRAWINGS_POS_CURRENT.groupby(credit.SK_ID_CURR).agg(lambda x: np.mean(x[x>0])),
    credit.CNT_INSTALMENT_MATURE_CUM.groupby(credit.SK_ID_CURR).nunique(),
    credit.CNT_INSTALMENT_MATURE_CUM.astype(str).groupby(credit.SK_ID_CURR).agg(lambda x: ','.join(set(x))),
    prev_credit_features.CNT_INSTALMENT_MATURE_CUM_latest.astype(str).groupby(prev_credit_features.SK_ID_CURR).agg(lambda x: ','.join(set(x))),
    credit.NAME_CONTRACT_STATUS.groupby(credit.SK_ID_CURR).nunique(),
    credit.NAME_CONTRACT_STATUS.astype(str).groupby(credit.SK_ID_CURR).agg(lambda x: ','.join(set(x))),
    credit.SK_DPD.groupby(credit.SK_ID_CURR).agg('max'),
    credit.SK_DPD.groupby(credit.SK_ID_CURR).agg(lambda x: sum(x>0)),
    credit.SK_DPD_DEF.groupby(credit.SK_ID_CURR).agg('max'),
    credit.SK_DPD_DEF.groupby(credit.SK_ID_CURR).agg(lambda x: sum(x>0)),
    credit.SK_DPD_diff.groupby(credit.SK_ID_CURR).agg('max'),
    credit.SK_DPD_diff.groupby(credit.SK_ID_CURR).agg(lambda x: sum(x>0)),
], axis = 1)

credit_features.columns = ['MONTHS_BALANCE_min','MONTHS_BALANCE_max','MONTHS_BALANCE_count','AMT_BALANCE_min','AMT_BALANCE_max', 'AMT_BALANCE_sum','AMT_BALANCE_mean','AMT_BALANCE_size','AMT_CREDIT_LIMIT_ACTUAL_min','AMT_CREDIT_LIMIT_ACTUAL_max','AMT_DRAWINGS_ATM_CURRENT_min','AMT_DRAWINGS_ATM_CURRENT_size','AMT_DRAWINGS_ATM_CURRENT_mean','AMT_DRAWINGS_ATM_CURRENT_sum','AMT_DRAWINGS_ATM_CURRENT_max','AMT_DRAWINGS_CURRENT_min','AMT_DRAWINGS_CURRENT_size','AMT_DRAWINGS_CURRENT_mean','AMT_DRAWINGS_CURRENT_sum','AMT_DRAWINGS_CURRENT_max','AMT_DRAWINGS_OTHER_CURRENT_min','AMT_DRAWINGS_OTHER_CURRENT_size','AMT_DRAWINGS_OTHER_CURRENT_mean','AMT_DRAWINGS_OTHER_CURRENT_sum','AMT_DRAWINGS_OTHER_CURRENT_max','AMT_DRAWINGS_POS_CURRENT_min','AMT_DRAWINGS_POS_CURRENT_size','AMT_DRAWINGS_POS_CURRENT_mean','AMT_DRAWINGS_POS_CURRENT_sum','AMT_DRAWINGS_POS_CURRENT_max','AMT_INST_MIN_REGULARITY_min','AMT_INST_MIN_REGULARITY_max','AMT_INST_MIN_REGULARITY_cntd','AMT_PAYMENT_CURRENT_sum','AMT_PAYMENT_CURRENT_max','AMT_PAYMENT_CURRENT_cntd','AMT_PAYMENT_TOTAL_CURRENT_sum','AMT_PAYMENT_TOTAL_CURRENT_max','AMT_PAYMENT_TOTAL_CURRENT_cntd','AMT_RECEIVABLE_PRINCIPAL_sum','AMT_RECEIVABLE_PRINCIPAL_max','AMT_RECEIVABLE_PRINCIPAL_cntd','AMT_RECIVABLE_sum','AMT_RECIVABLE_max','AMT_RECIVABLE_cntd','AMT_TOTAL_RECEIVABLE_sum','AMT_TOTAL_RECEIVABLE_max','AMT_TOTAL_RECEIVABLE_cntd','CNT_DRAWINGS_ATM_CURRENT_sum','CNT_DRAWINGS_ATM_CURRENT_max','CNT_DRAWINGS_ATM_CURRENT_mean','CNT_DRAWINGS_CURRENT_sum','CNT_DRAWINGS_CURRENT_max','CNT_DRAWINGS_CURRENT_mean','CNT_DRAWINGS_OTHER_CURRENT_sum','CNT_DRAWINGS_OTHER_CURRENT_max','CNT_DRAWINGS_OTHER_CURRENT_mean','CNT_DRAWINGS_POS_CURRENT_sum','CNT_DRAWINGS_POS_CURRENT_max','CNT_DRAWINGS_POS_CURRENT_mean','CNT_INSTALMENT_MATURE_CUM_cntd','CNT_INSTALMENT_MATURE_CUM_set','CNT_INSTALMENT_MATURE_CUM_latest','NAME_CONTRACT_STATUS_cntd','NAME_CONTRACT_STATUS_set','SK_DPD_max','SK_DPD_size','SK_DPD_DEF_max','SK_DPD_DEF_size','SK_DPD_diff_max','SK_DPD_diff_size']
credit_features = credit_features.reset_index()



In [15]:
credit_features.shape
credit_features.columns
credit_features.head(50)

(103558, 72)

Index(['SK_ID_CURR', 'MONTHS_BALANCE_min', 'MONTHS_BALANCE_max',
       'MONTHS_BALANCE_count', 'AMT_BALANCE_min', 'AMT_BALANCE_max',
       'AMT_BALANCE_sum', 'AMT_BALANCE_mean', 'AMT_BALANCE_size',
       'AMT_CREDIT_LIMIT_ACTUAL_min', 'AMT_CREDIT_LIMIT_ACTUAL_max',
       'AMT_DRAWINGS_ATM_CURRENT_min', 'AMT_DRAWINGS_ATM_CURRENT_size',
       'AMT_DRAWINGS_ATM_CURRENT_mean', 'AMT_DRAWINGS_ATM_CURRENT_sum',
       'AMT_DRAWINGS_ATM_CURRENT_max', 'AMT_DRAWINGS_CURRENT_min',
       'AMT_DRAWINGS_CURRENT_size', 'AMT_DRAWINGS_CURRENT_mean',
       'AMT_DRAWINGS_CURRENT_sum', 'AMT_DRAWINGS_CURRENT_max',
       'AMT_DRAWINGS_OTHER_CURRENT_min', 'AMT_DRAWINGS_OTHER_CURRENT_size',
       'AMT_DRAWINGS_OTHER_CURRENT_mean', 'AMT_DRAWINGS_OTHER_CURRENT_sum',
       'AMT_DRAWINGS_OTHER_CURRENT_max', 'AMT_DRAWINGS_POS_CURRENT_min',
       'AMT_DRAWINGS_POS_CURRENT_size', 'AMT_DRAWINGS_POS_CURRENT_mean',
       'AMT_DRAWINGS_POS_CURRENT_sum', 'AMT_DRAWINGS_POS_CURRENT_max',
       'AMT_INST_MIN_RE

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_min,MONTHS_BALANCE_max,MONTHS_BALANCE_count,AMT_BALANCE_min,AMT_BALANCE_max,AMT_BALANCE_sum,AMT_BALANCE_mean,AMT_BALANCE_size,AMT_CREDIT_LIMIT_ACTUAL_min,...,CNT_INSTALMENT_MATURE_CUM_set,CNT_INSTALMENT_MATURE_CUM_latest,NAME_CONTRACT_STATUS_cntd,NAME_CONTRACT_STATUS_set,SK_DPD_max,SK_DPD_size,SK_DPD_DEF_max,SK_DPD_DEF_size,SK_DPD_diff_max,SK_DPD_diff_size
0,100006,-6,-1,6,,0.0,0.0,,0.0,270000.0,...,0.0,0.0,1,Active,0,0,0,0,0,0
1,100011,-75,-2,74,804.195,189000.0,4031676.0,122172.006818,33.0,90000.0,...,"9.0,26.0,27.0,1.0,19.0,33.0,3.0,12.0,20.0,17.0...",33.0,1,Active,0,0,0,0,0,0
2,100013,-96,-1,96,67.5,161420.22,1743352.0,79243.283864,22.0,45000.0,...,"9.0,1.0,19.0,3.0,12.0,20.0,17.0,7.0,2.0,10.0,4...",22.0,1,Active,1,1,1,1,0,0
3,100021,-18,-2,17,,0.0,0.0,,0.0,675000.0,...,0.0,0.0,2,"Completed,Active",0,0,0,0,0,0
4,100023,-11,-4,8,,0.0,0.0,,0.0,45000.0,...,0.0,0.0,1,Active,0,0,0,0,0,0
5,100028,-49,-1,49,45.585,37335.915,396167.9,10707.239189,37.0,225000.0,...,"9.0,26.0,27.0,1.0,19.0,33.0,3.0,12.0,20.0,17.0...",35.0,1,Active,0,0,0,0,0,0
6,100036,-13,-2,12,,0.0,0.0,,0.0,45000.0,...,0.0,0.0,1,Active,0,0,0,0,0,0
7,100042,-84,-1,84,763.965,93118.455,2801919.0,71844.086538,39.0,45000.0,...,"9.0,26.0,27.0,1.0,19.0,33.0,3.0,36.0,38.0,12.0...",39.0,1,Active,1,2,0,0,1,2
8,100043,-33,-1,33,281.16,435861.585,6882896.0,344144.79,20.0,22500.0,...,"9.0,1.0,19.0,3.0,12.0,20.0,17.0,7.0,2.0,10.0,4...",20.0,1,Active,0,0,0,0,0,0
9,100047,-39,-4,36,,0.0,0.0,,0.0,45000.0,...,"nan,0.0",0.0,2,"Signed,Active",0,0,0,0,0,0


In [16]:
prev_credit_features.to_csv('./data/rawdata/prev_credit_features.csv',index=False)
credit_features.to_csv('./data/rawdata/credit_features.csv',index=False)
