In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import LabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
# train = pd.read_csv('./data/rawdata/application_train.csv.zip',compression='zip')
# test = pd.read_csv('./data/rawdata/application_test.csv.zip',compression='zip')

# pre = pd.read_csv('./data/rawdata/previous_application.csv.zip',compression='zip')
bureau = pd.read_csv('./data/rawdata/bureau.csv.zip',compression='zip')
bureau_balance = pd.read_csv('./data/rawdata/bureau_balance.csv.zip',compression='zip')


bureau.csv

All client's previous credits provided by other financial institutions that were reported to Credit Bureau (for clients who have a loan in our sample).

For every loan in our sample, there are as many rows as number of credits the client had in Credit Bureau before the application date.

bureau_balance.csv

Monthly balances of previous credits in Credit Bureau.

This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e the table has (#loans in sample * # of relative previous credits * # of months where we have some history observable for the previous credits) rows.

bureau_balance.csv,STATUS,"Status of Credit Bureau loan during the month (active, closed, DPD0-30,… [
C means closed, 
X means status unknown, 
0 means no DPD, 
1 means maximal did during month between 1-30, 
2 means DPD 31-60,… 
5 means DPD 120+ or sold or written off ] )",

In [3]:
# train (307511, 122)
# test (48744, 121)
bureau.shape
bureau.sort_values('SK_ID_CURR').head(10)

bureau_balance.shape
bureau_balance.head(50)


(1716428, 17)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
248487,100001,5896633,Closed,currency 1,-1572,0,-1329.0,-1328.0,,0,85500.0,0.0,0.0,0.0,Consumer credit,-155,0.0
248485,100001,5896631,Closed,currency 1,-909,0,-179.0,-877.0,,0,279720.0,0.0,0.0,0.0,Consumer credit,-155,0.0
248490,100001,5896636,Active,currency 1,-320,0,411.0,,,0,168345.0,110281.5,,0.0,Consumer credit,-10,9364.5
248489,100001,5896635,Active,currency 1,-49,0,1778.0,,,0,378000.0,373239.0,0.0,0.0,Consumer credit,-16,10822.5
248486,100001,5896632,Closed,currency 1,-879,0,-514.0,-544.0,,0,91620.0,0.0,0.0,0.0,Consumer credit,-155,0.0
248484,100001,5896630,Closed,currency 1,-857,0,-492.0,-553.0,,0,112500.0,0.0,0.0,0.0,Consumer credit,-155,0.0
248488,100001,5896634,Active,currency 1,-559,0,902.0,,,0,337680.0,113166.0,0.0,0.0,Consumer credit,-6,4630.5
675689,100002,6158909,Active,currency 1,-103,0,,,40.5,0,31988.565,0.0,31988.565,0.0,Credit card,-24,0.0
675688,100002,6158908,Closed,currency 1,-645,0,85.0,-36.0,5043.645,0,120735.0,0.0,0.0,0.0,Consumer credit,-34,0.0
675687,100002,6158907,Closed,currency 1,-1121,0,-911.0,-911.0,3321.0,0,19071.0,,,0.0,Consumer credit,-906,0.0


(27299925, 3)

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C
5,5715448,-5,C
6,5715448,-6,C
7,5715448,-7,C
8,5715448,-8,C
9,5715448,-9,0


In [5]:
bureau_balance_features = pd.concat([
    bureau_balance.MONTHS_BALANCE.groupby(bureau_balance.SK_ID_BUREAU).agg(['max','count']),
    bureau_balance.STATUS.groupby(bureau_balance.SK_ID_BUREAU).nunique(),
    bureau_balance.STATUS.groupby(bureau_balance.SK_ID_BUREAU).agg(lambda x: ','.join(set(x))),
    bureau_balance.STATUS.groupby(bureau_balance.SK_ID_BUREAU).agg(lambda x: list(x)[0]),
],axis = 1)
bureau_balance_features.columns = ['MONTHS_BALANCE_max','MONTHS_BALANCE_cnt','STATUS_cntd','STATUS_set','latest_STATUS']
bureau_balance_features = bureau_balance_features.reset_index()

In [6]:
bureau_balance_features.shape
bureau_balance_features.head(10)

(817395, 6)

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE_max,MONTHS_BALANCE_cnt,STATUS_cntd,STATUS_set,latest_STATUS
0,5001709,0,97,2,"X,C",C
1,5001710,0,83,3,"X,C,0",C
2,5001711,0,4,2,"X,0",X
3,5001712,0,19,2,"C,0",C
4,5001713,0,22,1,X,X
5,5001714,0,15,1,X,X
6,5001715,0,60,1,X,X
7,5001716,0,86,3,"X,C,0",C
8,5001717,0,22,2,"C,0",C
9,5001718,0,39,4,"X,C,0,1",C


In [None]:
bureau = bureau.merge(bureau_balance_features,how='left',on='SK_ID_BUREAU')
bureau.shape
bureau.head()

In [15]:
bureau.shape
bureau.head()
bureau.STATUS_set.value_counts()

(1716428, 22)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,...,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,MONTHS_BALANCE_max,MONTHS_BALANCE_cnt,STATUS_cntd,STATUS_set,latest_STATUS
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,...,,0.0,Consumer credit,-131,,,,,,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,...,,0.0,Credit card,-20,,,,,,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,...,,0.0,Consumer credit,-16,,,,,,
3,215354,5714465,Active,currency 1,-203,0,,,,0,...,,0.0,Credit card,-16,,,,,,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,...,,0.0,Consumer credit,-21,,,,,,


C,0                213717
X,0                161384
X,C,0              116595
X                   80209
0                   73077
C,0,1               30799
X,0,1               24578
X,C                 21931
X,C,0,1             15615
0,1                 12644
C                    8790
C,0,2,1              2395
X,1,C,2,0            1544
X,2,0,1              1233
1,C,2,5,0,4,3        1065
X,C,0,2               622
2,0,1                 615
X,1,C,2,5,0,4,3       601
1,C,2,0,3             584
X,1,2,5,0,4,3         477
X,1,C,2,0,3           467
C,0,2                 418
1,2,5,0,4,3           355
5,0                   346
X,2,0                 337
1,C,2,0,4,3           302
X,1,2,0,3             274
X,C,1                 200
X,5,0                 192
X,1                   179
                    ...  
X,1,5,0,4               2
3,2,5,0                 2
2,1                     2
C,3,2,1                 2
X,C,3                   2
3,X,4,0                 2
X,1,0,4,3               2
3,0,1       

In [17]:

bureau_sum = ['CREDIT_DAY_OVERDUE','DAYS_CREDIT_ENDDATE','DAYS_ENDDATE_FACT','DAYS_CREDIT',
               'DAYS_CREDIT_UPDATE','CNT_CREDIT_PROLONG','AMT_CREDIT_SUM','AMT_ANNUITY',
               'AMT_CREDIT_SUM_DEBT','AMT_CREDIT_SUM_LIMIT','AMT_CREDIT_MAX_OVERDUE','MONTHS_BALANCE_max',
              'MONTHS_BALANCE_cnt','STATUS_cntd']

bureau_max = ['AMT_CREDIT_MAX_OVERDUE','CNT_CREDIT_PROLONG', 'DAYS_CREDIT','MONTHS_BALANCE_max','STATUS_cntd']
bureau_countd = ['CREDIT_CURRENCY','CREDIT_ACTIVE','CREDIT_TYPE','STATUS_set','latest_STATUS']

bureau_features = pd.concat([
    bureau.SK_ID_CURR.value_counts(),
    bureau[bureau_sum].groupby(bureau.SK_ID_CURR).sum(),
    bureau[bureau_max].groupby(bureau.SK_ID_CURR).max(),
    bureau[bureau_countd].groupby(bureau.SK_ID_CURR).nunique(),
    bureau[['CREDIT_CURRENCY','CREDIT_ACTIVE','CREDIT_TYPE','latest_STATUS']].groupby(bureau.SK_ID_CURR).agg(lambda x: set(x)),
    bureau['STATUS_set'].astype(str).groupby(bureau.SK_ID_CURR).agg(lambda x: ','.join(set(','.join(x).split(',')))),
], axis = 1)

bureau_features.columns = ['bureau_cnt'] + [f+'_sum' for f in bureau_sum] + [f+'_max' for f in bureau_max] + [f+'_cnt' for f in bureau_countd] + [f+'_set' for f in ['CREDIT_CURRENCY','CREDIT_ACTIVE','CREDIT_TYPE','latest_STATUS']] + ['STATUS_set']
bureau_features = bureau_features.reset_index()


In [18]:
bureau_features.shape
bureau_features.head()

(305811, 31)

Unnamed: 0,SK_ID_CURR,bureau_cnt,CREDIT_DAY_OVERDUE_sum,DAYS_CREDIT_ENDDATE_sum,DAYS_ENDDATE_FACT_sum,DAYS_CREDIT_sum,DAYS_CREDIT_UPDATE_sum,CNT_CREDIT_PROLONG_sum,AMT_CREDIT_SUM_sum,AMT_ANNUITY_sum,...,CREDIT_CURRENCY_cnt,CREDIT_ACTIVE_cnt,CREDIT_TYPE_cnt,STATUS_set_cnt,latest_STATUS_cnt,CREDIT_CURRENCY_set,CREDIT_ACTIVE_set,CREDIT_TYPE_set,latest_STATUS_set,STATUS_set
0,100001,7,0,577.0,-3302.0,-5145,-652,0,1453365.0,24817.5,...,1,2,1,5,3,{currency 1},"{Closed, Active}",{Consumer credit},"{C, 0, 1}","X,C,0,1"
1,100002,8,0,-2094.0,-4185.0,-6992,-3999,0,865055.565,0.0,...,1,2,2,4,2,{currency 1},"{Closed, Active}","{Credit card, Consumer credit}","{C, 0}","X,C,0,1"
2,100003,4,0,-2178.0,-3292.0,-5603,-3264,0,1017400.5,,...,1,2,2,0,0,{currency 1},"{Closed, Active}","{Credit card, Consumer credit}",{nan},
3,100004,2,0,-977.0,-1065.0,-1734,-1064,0,189037.8,,...,1,1,1,0,0,{currency 1},{Closed},{Consumer credit},{nan},
4,100005,3,0,1318.0,-123.0,-572,-163,0,657126.0,4261.5,...,1,2,2,3,3,{currency 1},"{Closed, Active}","{Credit card, Consumer credit}","{X, C, 0}","X,C,0"


In [19]:
bureau_features.to_csv('./data/rawdata/bureau_feature.csv',index=False)