# Setup

In [2]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

In [5]:
bureau = pd.read_csv("../data/bureau.csv")
bureau

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.00,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.00,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.50,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.00,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.00,,,0.0,Consumer credit,-21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1716423,259355,5057750,Active,currency 1,-44,0,-30.0,,0.0,0,11250.00,11250.0,0.0,0.0,Microloan,-19,
1716424,100044,5057754,Closed,currency 1,-2648,0,-2433.0,-2493.0,5476.5,0,38130.84,0.0,0.0,0.0,Consumer credit,-2493,
1716425,100044,5057762,Closed,currency 1,-1809,0,-1628.0,-970.0,,0,15570.00,,,0.0,Consumer credit,-967,
1716426,246829,5057770,Closed,currency 1,-1878,0,-1513.0,-1513.0,,0,36000.00,0.0,0.0,0.0,Consumer credit,-1508,


- ID of loan in our sample one loan in our sample can have 0,1,2 or more related previous credits in credit bureau 
- Recoded ID of previous Credit Bureau credit related to our loan (unique coding for each loan application)
- Status of the Credit Bureau (CB) reported credits
- Recoded currency of the Credit Bureau credit
- How many days before current application did client apply for Credit Bureau credit
- Number of days past due on CB credit at the time of application for related loan in our sample
- Remaining duration of CB credit (in days) at the time of application in Home Credit
- Days since CB credit ended at the time of application in Home Credit (only for closed credit)
- Maximal amount overdue on the Credit Bureau credit so far (at application date of loan in our sample)
- How many times was the Credit Bureau credit prolonged
- Current credit amount for the Credit Bureau credit
- Current debt on Credit Bureau credit
- Current credit limit of credit card reported in Credit Bureau
- Current amount overdue on Credit Bureau credit
- Type of Credit Bureau credit (Car, cash,...)
- How many days before loan application did last information about the Credit Bureau credit come
- Annuity of the Credit Bureau credit

In [3]:
len(bureau.SK_ID_BUREAU.unique())

1716428

SK_ID_BUREAU are unique row identifiers in the bureau dataset
    meaning that the bureau has multiple applications that are linked to one current bank credit

In [4]:
print("Percentage of rows empty: ")
print("")
print(bureau.isna().sum() / len(bureau.SK_ID_CURR) * 100)

Percentage of rows empty: 

SK_ID_CURR                 0.000000
SK_ID_BUREAU               0.000000
CREDIT_ACTIVE              0.000000
CREDIT_CURRENCY            0.000000
DAYS_CREDIT                0.000000
CREDIT_DAY_OVERDUE         0.000000
DAYS_CREDIT_ENDDATE        6.149573
DAYS_ENDDATE_FACT         36.916958
AMT_CREDIT_MAX_OVERDUE    65.513264
CNT_CREDIT_PROLONG         0.000000
AMT_CREDIT_SUM             0.000757
AMT_CREDIT_SUM_DEBT       15.011932
AMT_CREDIT_SUM_LIMIT      34.477415
AMT_CREDIT_SUM_OVERDUE     0.000000
CREDIT_TYPE                0.000000
DAYS_CREDIT_UPDATE         0.000000
AMT_ANNUITY               71.473490
dtype: float64


# Bureau table

## Active credits

Calculate the share of active credits of total credits known of the creditor

In [5]:
bureau["credit_active_dummy"] = [1 if d == "Active" else 0 for d in bureau.CREDIT_ACTIVE]

share_of_active_credits_of_total_credits = bureau[["SK_ID_CURR", "credit_active_dummy"]].groupby("SK_ID_CURR").agg(np.mean)
share_of_active_credits_of_total_credits.columns = ['share_of_active_credits_of_total_credits']

number_of_credits_registered = bureau[["SK_ID_CURR"]].groupby("SK_ID_CURR")['SK_ID_CURR'].count()
number_of_credits_registered = pd.DataFrame(number_of_credits_registered)
number_of_credits_registered.columns = ['number_of_credits_registered']

## Last credit application

In [6]:
bureau.DAYS_CREDIT.value_counts()

-364    1330
-336    1248
-273    1238
-357    1218
-343    1203
        ... 
-4       113
-3        74
-2        42
 0        25
-1        17
Name: DAYS_CREDIT, Length: 2923, dtype: int64

In [29]:
last_credit_application_before_mortgage = bureau[["SK_ID_CURR", "DAYS_CREDIT"]].groupby("SK_ID_CURR").agg(np.min)
last_credit_application_before_mortgage.columns = ["last_credit_application_before_mortgage"]
last_credit_application_before_mortgage

Unnamed: 0_level_0,last_credit_application_before_mortgage
SK_ID_CURR,Unnamed: 1_level_1
100001,-1572
100002,-1437
100003,-2586
100004,-1326
100005,-373
...,...
456249,-2713
456250,-1002
456253,-919
456254,-1104


## Days overdue

In [8]:
bureau.CREDIT_DAY_OVERDUE.value_counts()

0       1712211
30          311
60          126
8           103
13          103
         ...   
1445          1
278           1
707           1
2193          1
1931          1
Name: CREDIT_DAY_OVERDUE, Length: 942, dtype: int64

In [94]:
bureau.CREDIT_DAY_OVERDUE.isna().sum()

0

In [110]:
std_days_overdue_on_one_credit = bureau[["SK_ID_CURR", "CREDIT_DAY_OVERDUE"]].groupby("SK_ID_CURR").agg(np.std)
std_days_overdue_on_one_credit.columns = ["std_days_overdue_on_one_credit"]
std_days_overdue_on_one_credit["std_days_overdue_on_one_credit"] = [d if .isnan() else 0 for d in std_days_overdue_on_one_credit["std_days_overdue_on_one_credit"]]
std_days_overdue_on_one_credit

TypeError: 'module' object is not callable

In [102]:
pd.DataFrame(std_days_overdue_on_one_credit).isna().sum()

std_days_overdue_on_one_credit    0
dtype: int64

## Credit amount

This is all credit amount, might make more sense to look only at active credit amounts. See below

In [10]:
bureau.AMT_CREDIT_SUM.value_counts()

0.000         66582
225000.000    57608
135000.000    50195
450000.000    37156
90000.000     36940
              ...  
18915.030         1
167026.095        1
57044.250         1
69295.410         1
108765.720        1
Name: AMT_CREDIT_SUM, Length: 236708, dtype: int64

In [11]:
total_amount_of_credit_taken = bureau[["SK_ID_CURR", "AMT_CREDIT_SUM"]].groupby("SK_ID_CURR").agg(np.sum)
total_amount_of_credit_taken.columns = ["total_amount_of_credit_taken"]
total_amount_of_credit_taken

Unnamed: 0_level_0,total_amount_of_credit_taken
SK_ID_CURR,Unnamed: 1_level_1
100001,1453365.000
100002,865055.565
100003,1017400.500
100004,189037.800
100005,657126.000
...,...
456249,3693858.660
456250,3086459.550
456253,3960000.000
456254,45000.000


This is only active credit amounts

In [81]:
bureau["active_AMT_CREDIT_SUM"] = bureau["AMT_CREDIT_SUM"] * bureau["credit_active_dummy"]
active_total_amount_of_credit_taken = bureau[["SK_ID_CURR", "active_AMT_CREDIT_SUM"]].groupby("SK_ID_CURR").agg(np.sum)
active_total_amount_of_credit_taken.columns = ["active_total_amount_of_credit_taken"]
active_total_amount_of_credit_taken

Unnamed: 0_level_0,active_total_amount_of_credit_taken
SK_ID_CURR,Unnamed: 1_level_1
100001,884025.000
100002,481988.565
100003,810000.000
100004,0.000
100005,598626.000
...,...
456249,405000.000
456250,2603110.050
456253,2610000.000
456254,0.000


## Credit amount overdue

This is all credit amount overdue, might make more sense to look only at active credit amounts overdue. See below

In [12]:
bureau.AMT_CREDIT_SUM_OVERDUE.value_counts()

0.000       1712270
4.500           301
9.000           107
13.500           81
18.000           72
             ...   
323.010           1
88.695            1
142.830           1
5069.070          1
352.620           1
Name: AMT_CREDIT_SUM_OVERDUE, Length: 1616, dtype: int64

In [13]:
total_amount_of_credit_overdue = bureau[["SK_ID_CURR", "AMT_CREDIT_SUM_OVERDUE"]].groupby("SK_ID_CURR").agg(np.sum)
total_amount_of_credit_overdue.columns = ["total_amount_of_credit_overdue"]
total_amount_of_credit_overdue

Unnamed: 0_level_0,total_amount_of_credit_overdue
SK_ID_CURR,Unnamed: 1_level_1
100001,0.0
100002,0.0
100003,0.0
100004,0.0
100005,0.0
...,...
456249,0.0
456250,0.0
456253,0.0
456254,0.0


This is only active credit amount overdue

In [82]:
bureau["active_AMT_CREDIT_SUM_OVERDUE"] = bureau["AMT_CREDIT_SUM_OVERDUE"] * bureau["credit_active_dummy"]
active_total_amount_of_credit_overdue = bureau[["SK_ID_CURR", "AMT_CREDIT_SUM_OVERDUE"]].groupby("SK_ID_CURR").agg(np.sum)
active_total_amount_of_credit_overdue.columns = ["active_total_amount_of_credit_overdue"]
active_total_amount_of_credit_overdue

Unnamed: 0_level_0,active_total_amount_of_credit_overdue
SK_ID_CURR,Unnamed: 1_level_1
100001,0.0
100002,0.0
100003,0.0
100004,0.0
100005,0.0
...,...
456249,0.0
456250,0.0
456253,0.0
456254,0.0


## Credit Type

In [14]:
bureau.CREDIT_TYPE.value_counts()

Consumer credit                                 1251615
Credit card                                      402195
Car loan                                          27690
Mortgage                                          18391
Microloan                                         12413
Loan for business development                      1975
Another type of loan                               1017
Unknown type of loan                                555
Loan for working capital replenishment              469
Cash loan (non-earmarked)                            56
Real estate loan                                     27
Loan for the purchase of equipment                   19
Loan for purchase of shares (margin lending)          4
Mobile operator loan                                  1
Interbank credit                                      1
Name: CREDIT_TYPE, dtype: int64

Approach ideas:
- what kind of other credits taken?
- income generating vs consumer credits taken?

I think income generating vs consumer is best

In [15]:
income_generating_credits = ('Loan for business development', 'Loan for working capital replenishment', 'Real estate loan', 'Loan for the purchase of equipment', 'Loan for purchase of shares (margin lending)')

In [16]:
bureau["income_generating_credit"] = [1 if d in income_generating_credits else 0 for d in bureau.CREDIT_TYPE]
number_of_income_generating_credits = bureau[["SK_ID_CURR", "income_generating_credit"]].groupby("SK_ID_CURR").agg(np.sum)
number_of_income_generating_credits.columns = ["number_of_income_generating_credits"]

share_of_income_generating_credits = pd.concat([number_of_income_generating_credits, number_of_credits_registered], axis = 1)

share_of_income_generating_credits["share_of_income_generating_credits"] = share_of_income_generating_credits["number_of_income_generating_credits"] / share_of_income_generating_credits["number_of_credits_registered"]
share_of_income_generating_credits = share_of_income_generating_credits["share_of_income_generating_credits"]
share_of_income_generating_credits

SK_ID_CURR
100001    0.0
100002    0.0
100003    0.0
100004    0.0
100005    0.0
         ... 
456249    0.0
456250    0.0
456253    0.0
456254    0.0
456255    0.0
Name: share_of_income_generating_credits, Length: 305811, dtype: float64

# Bureau balance table

In [None]:
bureau_balance = pd.read_csv("../data/bureau_balance.csv")
bureau_balance

- Recoded ID of Credit Bureau credit (unique coding for each application) use this to join to CREDIT_BUREAU table
- Month of balance relative to application date (-1 means the freshest balance date)
- Status of Credit Bureau loan during the month (active, closed, DPD0-30,� [C means closed, X means status unknown, 0 means no DPD, 1 means maximal did during month between 1-30, 2 means DPD 31-60,� 5 means DPD 120+ or sold or written off ] )

In [21]:
print("Percentage of rows empty: ")
print("")
print(bureau_balance.isna().sum() / len(bureau_balance.SK_ID_BUREAU) * 100)

Percentage of rows empty: 

SK_ID_BUREAU      0.0
MONTHS_BALANCE    0.0
STATUS            0.0
dtype: float64


In [57]:
link_table = bureau[["SK_ID_CURR", "SK_ID_BUREAU"]].drop_duplicates()
link_table

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU
0,215354,5714462
1,215354,5714463
2,215354,5714464
3,215354,5714465
4,215354,5714466
...,...,...
1716423,259355,5057750
1716424,100044,5057754
1716425,100044,5057762
1716426,246829,5057770


In [87]:
bureau_balance.STATUS.value_counts()

C    13646993
0     7499507
X     5810482
1      242347
5       62406
2       23419
3        8924
4        5847
Name: STATUS, dtype: int64

In [88]:
bureau_balance["defaulted"] = [1 if d != "0|1|2|3|C" else 0 for d in bureau_balance["STATUS"]]
bureau_balance["year_before_application"] = [1 if d > -13 else 0 for d in bureau_balance['MONTHS_BALANCE']]
bureau_balance

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS,defaulted,year_before_application
0,5715448,0,C,1,1
1,5715448,-1,C,1,1
2,5715448,-2,C,1,1
3,5715448,-3,C,1,1
4,5715448,-4,C,1,1
...,...,...,...,...,...
27299920,5041336,-47,X,1,0
27299921,5041336,-48,X,1,0
27299922,5041336,-49,X,1,0
27299923,5041336,-50,X,1,0


In [77]:
defaulted_last_year_bureau = bureau_balance[bureau_balance["year_before_application"] == 1][["SK_ID_BUREAU", "defaulted"]].groupby("SK_ID_BUREAU").agg(np.sum)
defaulted_last_year_bureau.columns = ["defaulted_last_year_bureau"]
defaulted_last_year_bureau["defaulted_last_year_bureau"] = [1 if d > 0 else 0 for d in defaulted_last_year_bureau["defaulted_last_year_bureau"]]
defaulted_last_year_bureau = defaulted_last_year_bureau.merge(link_table, on = "SK_ID_BUREAU", how = "left")[["SK_ID_CURR", "defaulted_last_year_bureau"]]

defaulted_last_year_bureau

Unnamed: 0,SK_ID_CURR,defaulted_last_year_bureau
0,,1
1,162368.0,1
2,162368.0,1
3,162368.0,1
4,150635.0,1
...,...,...
682159,387020.0,1
682160,387020.0,1
682161,387020.0,1
682162,387020.0,1


In [80]:
defaulted_last_year = defaulted_last_year_bureau.groupby("SK_ID_CURR").agg(np.sum)
defaulted_last_year["defaulted_last_year"] = [1 if d > 0 else 0 for d in defaulted_last_year["defaulted_last_year_bureau"]]
defaulted_last_year.rename({"defaulted_last_year_bureau" : "number_of_credits_defaulted_last_year"}, inplace = True)
defaulted_last_year

Unnamed: 0_level_0,defaulted_last_year_bureau,defaulted_last_year
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1
100001.0,7,1
100002.0,2,1
100005.0,3,1
100010.0,1,1
100013.0,4,1
...,...,...
456247.0,9,1
456250.0,3,1
456253.0,4,1
456254.0,1,1


# Getting everything together

In [89]:
bureau_reduced = pd.DataFrame(bureau.SK_ID_CURR.unique())
bureau_reduced.columns = ["SK_ID_CURR"]
bureau_reduced

Unnamed: 0,SK_ID_CURR
0,215354
1,162297
2,402440
3,238881
4,222183
...,...
305806,207190
305807,324956
305808,448157
305809,345866


In [90]:
bureau_reduced = bureau_reduced.merge(share_of_active_credits_of_total_credits, on ='SK_ID_CURR', how = 'left')
bureau_reduced = bureau_reduced.merge(number_of_credits_registered, on ='SK_ID_CURR', how = 'left')

bureau_reduced = bureau_reduced.merge(last_credit_application_before_mortgage, on ='SK_ID_CURR', how = 'left')

bureau_reduced = bureau_reduced.merge(std_days_overdue_on_one_credit, on ='SK_ID_CURR', how = 'left')

bureau_reduced = bureau_reduced.merge(total_amount_of_credit_taken, on ='SK_ID_CURR', how = 'left')
bureau_reduced = bureau_reduced.merge(active_total_amount_of_credit_taken, on ='SK_ID_CURR', how = 'left')

bureau_reduced = bureau_reduced.merge(total_amount_of_credit_overdue, on ='SK_ID_CURR', how = 'left')
bureau_reduced = bureau_reduced.merge(active_total_amount_of_credit_overdue, on ='SK_ID_CURR', how = 'left')

bureau_reduced = bureau_reduced.merge(share_of_income_generating_credits, on ='SK_ID_CURR', how = 'left')
bureau_reduced = bureau_reduced.merge(defaulted_last_year, on ='SK_ID_CURR', how = 'left')

bureau_reduced

Unnamed: 0,SK_ID_CURR,share_of_active_credits_of_total_credits,number_of_credits_registered,last_credit_application_before_mortgage,std_days_overdue_on_one_credit,total_amount_of_credit_taken,active_total_amount_of_credit_taken,total_amount_of_credit_overdue,active_total_amount_of_credit_overdue,share_of_income_generating_credits,defaulted_last_year_bureau,defaulted_last_year
0,215354,0.545455,11,-1872,0.0,5973945.30,3701427.3,0.0,0.0,0.0,,
1,162297,0.500000,6,-2456,0.0,8230386.15,7375500.0,0.0,0.0,0.0,,
2,402440,1.000000,1,-96,,89910.00,89910.0,0.0,0.0,0.0,,
3,238881,0.375000,8,-2911,0.0,1285239.06,769500.0,0.0,0.0,0.0,,
4,222183,0.625000,8,-2744,0.0,7158960.00,5187393.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
305806,207190,0.000000,1,-532,,450000.00,0.0,0.0,0.0,0.0,1.0,1.0
305807,324956,0.000000,1,-381,,19800.00,0.0,0.0,0.0,0.0,1.0,1.0
305808,448157,1.000000,1,-1441,,1800000.00,1800000.0,0.0,0.0,0.0,,
305809,345866,0.000000,1,-375,,175054.50,0.0,0.0,0.0,0.0,1.0,1.0


In [91]:
bureau_reduced.isna().sum()

SK_ID_CURR                                       0
share_of_active_credits_of_total_credits         0
number_of_credits_registered                     0
last_credit_application_before_mortgage          0
std_days_overdue_on_one_credit               41520
total_amount_of_credit_taken                     0
active_total_amount_of_credit_taken              0
total_amount_of_credit_overdue                   0
active_total_amount_of_credit_overdue            0
share_of_income_generating_credits               0
defaulted_last_year_bureau                  173445
defaulted_last_year                         173445
dtype: int64