# Setup

In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

# Bureau table

### Load data

In [2]:
bureau = pd.read_csv("data/bureau.csv")
bureau

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.00,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.00,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.50,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.00,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.00,,,0.0,Consumer credit,-21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1716423,259355,5057750,Active,currency 1,-44,0,-30.0,,0.0,0,11250.00,11250.0,0.0,0.0,Microloan,-19,
1716424,100044,5057754,Closed,currency 1,-2648,0,-2433.0,-2493.0,5476.5,0,38130.84,0.0,0.0,0.0,Consumer credit,-2493,
1716425,100044,5057762,Closed,currency 1,-1809,0,-1628.0,-970.0,,0,15570.00,,,0.0,Consumer credit,-967,
1716426,246829,5057770,Closed,currency 1,-1878,0,-1513.0,-1513.0,,0,36000.00,0.0,0.0,0.0,Consumer credit,-1508,


- ID of loan in our sample one loan in our sample can have 0,1,2 or more related previous credits in credit bureau 
- Recoded ID of previous Credit Bureau credit related to our loan (unique coding for each loan application)
- Status of the Credit Bureau (CB) reported credits
- Recoded currency of the Credit Bureau credit
- How many days before current application did client apply for Credit Bureau credit
- Number of days past due on CB credit at the time of application for related loan in our sample
- Remaining duration of CB credit (in days) at the time of application in Home Credit
- Days since CB credit ended at the time of application in Home Credit (only for closed credit)
- Maximal amount overdue on the Credit Bureau credit so far (at application date of loan in our sample)
- How many times was the Credit Bureau credit prolonged
- Current credit amount for the Credit Bureau credit
- Current debt on Credit Bureau credit
- Current credit limit of credit card reported in Credit Bureau
- Current amount overdue on Credit Bureau credit
- Type of Credit Bureau credit (Car, cash,...)
- How many days before loan application did last information about the Credit Bureau credit come
- Annuity of the Credit Bureau credit

### Overview

In [3]:
len(bureau.SK_ID_BUREAU.unique())

1716428

SK_ID_BUREAU are unique row identifiers in the bureau dataset
    meaning that the bureau has multiple applications that are linked to one current bank credit

In [4]:
print("Percentage of rows empty: ")
print("")
print(bureau.isna().sum() / len(bureau.SK_ID_CURR) * 100)

Percentage of rows empty: 

SK_ID_CURR                 0.000000
SK_ID_BUREAU               0.000000
CREDIT_ACTIVE              0.000000
CREDIT_CURRENCY            0.000000
DAYS_CREDIT                0.000000
CREDIT_DAY_OVERDUE         0.000000
DAYS_CREDIT_ENDDATE        6.149573
DAYS_ENDDATE_FACT         36.916958
AMT_CREDIT_MAX_OVERDUE    65.513264
CNT_CREDIT_PROLONG         0.000000
AMT_CREDIT_SUM             0.000757
AMT_CREDIT_SUM_DEBT       15.011932
AMT_CREDIT_SUM_LIMIT      34.477415
AMT_CREDIT_SUM_OVERDUE     0.000000
CREDIT_TYPE                0.000000
DAYS_CREDIT_UPDATE         0.000000
AMT_ANNUITY               71.473490
dtype: float64


### Aggregation to mortgage ID

In [5]:
income_generating_credits = (
    'Loan for business development', \
    'Loan for working capital replenishment', \
    'Real estate loan', \
    'Loan for the purchase of equipment', \
    'Loan for purchase of shares (margin lending)', \
    )

In [6]:
bureau["credit_active_dummy"] = [1 if d == "Active" else 0 for d in bureau.CREDIT_ACTIVE]
bureau["active_AMT_CREDIT_SUM"] = bureau["AMT_CREDIT_SUM"] * bureau["credit_active_dummy"]
bureau["active_AMT_CREDIT_SUM_OVERDUE"] = bureau["AMT_CREDIT_SUM_OVERDUE"] * bureau["credit_active_dummy"]
bureau["income_generating_credit"] = [1 if d in income_generating_credits else 0 for d in bureau.CREDIT_TYPE]

In [7]:
def aggregate_variable(name, func):
    a = bureau[["SK_ID_CURR", name]].groupby("SK_ID_CURR").agg(func)
    if func == np.std:
        a[a[name].isna()] = 0
    return a

In [8]:
transformations = {
    "number_of_credits_registered" : pd.DataFrame({"number_of_credits_registered" : bureau[["SK_ID_CURR"]].groupby("SK_ID_CURR")['SK_ID_CURR'].count()}),
    "share_of_active_credits_of_total_credits" : aggregate_variable("credit_active_dummy", np.mean),
    "last_credit_application_before_mortgage" : aggregate_variable("DAYS_CREDIT", np.min),
    "std_days_overdue_on_one_credit" : aggregate_variable("CREDIT_DAY_OVERDUE", np.std),
    "total_amount_of_credit_taken" : aggregate_variable("AMT_CREDIT_SUM", np.sum),
    "active_total_amount_of_credit_taken" : aggregate_variable("active_AMT_CREDIT_SUM", np.sum),
    "total_amount_of_credit_overdue" : aggregate_variable("AMT_CREDIT_SUM_OVERDUE", np.sum),
    "active_total_amount_of_credit_overdue" : aggregate_variable("AMT_CREDIT_SUM_OVERDUE", np.sum),
    "number_of_income_generating_credits" : aggregate_variable("income_generating_credit", np.sum),
}

# Bureau balance table

### Load data

In [9]:
bureau_balance = pd.read_csv("data/bureau_balance.csv")
bureau_balance

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C
...,...,...,...
27299920,5041336,-47,X
27299921,5041336,-48,X
27299922,5041336,-49,X
27299923,5041336,-50,X


- Recoded ID of Credit Bureau credit (unique coding for each application) use this to join to CREDIT_BUREAU table
- Month of balance relative to application date (-1 means the freshest balance date)
- Status of Credit Bureau loan during the month (active, closed, DPD0-30,� [C means closed, X means status unknown, 0 means no DPD, 1 means maximal did during month between 1-30, 2 means DPD 31-60,� 5 means DPD 120+ or sold or written off ] )

### Overview

In [10]:
print("Percentage of rows empty: ")
print("")
print(bureau_balance.isna().sum() / len(bureau_balance.SK_ID_BUREAU) * 100)

Percentage of rows empty: 

SK_ID_BUREAU      0.0
MONTHS_BALANCE    0.0
STATUS            0.0
dtype: float64


In [11]:
bureau_balance.STATUS.value_counts()

C    13646993
0     7499507
X     5810482
1      242347
5       62406
2       23419
3        8924
4        5847
Name: STATUS, dtype: int64

### Aggregation to mortgage ID

In [12]:
bureau_balance["issues"] = [1 if d != "0|1|2|3|C" else 0 for d in bureau_balance["STATUS"]]
issues_last_year_bureau = bureau_balance[bureau_balance["MONTHS_BALANCE"] > -13][["SK_ID_BUREAU", "issues"]].groupby("SK_ID_BUREAU").agg(np.max)
issues_last_year = issues_last_year_bureau.merge(bureau[["SK_ID_CURR", "SK_ID_BUREAU"]].drop_duplicates(), on = "SK_ID_BUREAU", how = "left").groupby("SK_ID_CURR").agg(np.max).iloc[:, 1]

transformations["issues_last_year"] = issues_last_year

# Joining all created features into one table

### Merge all features based on mortgage ID

In [13]:
bureau_reduced = pd.DataFrame({
    "SK_ID_CURR" : bureau.SK_ID_CURR.unique()
})

for feature in transformations.keys():
    bureau_reduced = bureau_reduced.merge(transformations[feature], on ='SK_ID_CURR', how = 'left')

bureau_reduced.columns = ["SK_ID_CURR"] + list(transformations.keys())

### Overview of final table

In [14]:
bureau_reduced.isna().sum()

SK_ID_CURR                                       0
number_of_credits_registered                     0
share_of_active_credits_of_total_credits         0
last_credit_application_before_mortgage          0
std_days_overdue_on_one_credit                   0
total_amount_of_credit_taken                     0
active_total_amount_of_credit_taken              0
total_amount_of_credit_overdue                   0
active_total_amount_of_credit_overdue            0
number_of_income_generating_credits              0
issues_last_year                            173445
dtype: int64

In [15]:
bureau_reduced

Unnamed: 0,SK_ID_CURR,number_of_credits_registered,share_of_active_credits_of_total_credits,last_credit_application_before_mortgage,std_days_overdue_on_one_credit,total_amount_of_credit_taken,active_total_amount_of_credit_taken,total_amount_of_credit_overdue,active_total_amount_of_credit_overdue,number_of_income_generating_credits,issues_last_year
0,215354,11,0.545455,-1872,0.0,5973945.30,3701427.3,0.0,0.0,0,
1,162297,6,0.500000,-2456,0.0,8230386.15,7375500.0,0.0,0.0,0,
2,402440,1,1.000000,-96,0.0,89910.00,89910.0,0.0,0.0,0,
3,238881,8,0.375000,-2911,0.0,1285239.06,769500.0,0.0,0.0,0,
4,222183,8,0.625000,-2744,0.0,7158960.00,5187393.0,0.0,0.0,0,
...,...,...,...,...,...,...,...,...,...,...,...
305806,207190,1,0.000000,-532,0.0,450000.00,0.0,0.0,0.0,0,1.0
305807,324956,1,0.000000,-381,0.0,19800.00,0.0,0.0,0.0,0,1.0
305808,448157,1,1.000000,-1441,0.0,1800000.00,1800000.0,0.0,0.0,0,
305809,345866,1,0.000000,-375,0.0,175054.50,0.0,0.0,0.0,0,1.0


### Manual corrections

In [16]:
bureau_reduced["share_of_income_generating_credits"] = bureau_reduced["number_of_income_generating_credits"] / bureau_reduced["number_of_credits_registered"]

In [17]:
bureau_reduced[bureau_reduced["issues_last_year"].isna()] = 0 # people where no information assume no issues?

In [18]:
bureau_reduced.isna().sum()

SK_ID_CURR                                  0
number_of_credits_registered                0
share_of_active_credits_of_total_credits    0
last_credit_application_before_mortgage     0
std_days_overdue_on_one_credit              0
total_amount_of_credit_taken                0
active_total_amount_of_credit_taken         0
total_amount_of_credit_overdue              0
active_total_amount_of_credit_overdue       0
number_of_income_generating_credits         0
issues_last_year                            0
share_of_income_generating_credits          0
dtype: int64

# Some tests

In [19]:
problematic = (
    "std_days_overdue_on_one_credit",
    "total_amount_of_credit_overdue",
    "active_total_amount_of_credit_overdue",
    "number_of_income_generating_credits",
    "share_of_income_generating_credits",
    )

In [25]:
for var in problematic:
    print(bureau_reduced[var].value_counts())

0.000000       304219
10.606602          21
13.416408          20
11.338934          18
15.000000          17
                ...  
8.043153            1
1395.121679         1
39.600084           1
121.622366          1
33.941125           1
Name: std_days_overdue_on_one_credit, Length: 963, dtype: int64
0.000          304139
4.500             117
9.000              34
13.500             33
22.500             31
                ...  
948.060             1
516.060             1
1044.000            1
2079509.895         1
22581.000           1
Name: total_amount_of_credit_overdue, Length: 714, dtype: int64
0.000          304139
4.500             117
9.000              34
13.500             33
22.500             31
                ...  
948.060             1
516.060             1
1044.000            1
2079509.895         1
22581.000           1
Name: active_total_amount_of_credit_overdue, Length: 714, dtype: int64
0     304868
1        800
2         97
3         31
4         10
5         