# Setup

In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

# Helper functions

# Bureau table

## Load data

In [2]:
# Read the raw Credit Bureau table from the project's data folder and display it.
bureau = pd.read_csv(filepath_or_buffer="../data/bureau.csv")
bureau

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.00,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.00,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.50,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.00,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.00,,,0.0,Consumer credit,-21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1716423,259355,5057750,Active,currency 1,-44,0,-30.0,,0.0,0,11250.00,11250.0,0.0,0.0,Microloan,-19,
1716424,100044,5057754,Closed,currency 1,-2648,0,-2433.0,-2493.0,5476.5,0,38130.84,0.0,0.0,0.0,Consumer credit,-2493,
1716425,100044,5057762,Closed,currency 1,-1809,0,-1628.0,-970.0,,0,15570.00,,,0.0,Consumer credit,-967,
1716426,246829,5057770,Closed,currency 1,-1878,0,-1513.0,-1513.0,,0,36000.00,0.0,0.0,0.0,Consumer credit,-1508,


- `SK_ID_CURR`: ID of loan in our sample. One loan in our sample can have 0, 1, 2 or more related previous credits in the credit bureau
- `SK_ID_BUREAU`: Recoded ID of previous Credit Bureau credit related to our loan (unique coding for each loan application)
- `CREDIT_ACTIVE`: Status of the Credit Bureau (CB) reported credits
- `CREDIT_CURRENCY`: Recoded currency of the Credit Bureau credit
- `DAYS_CREDIT`: How many days before current application did client apply for Credit Bureau credit
- `CREDIT_DAY_OVERDUE`: Number of days past due on CB credit at the time of application for related loan in our sample
- `DAYS_CREDIT_ENDDATE`: Remaining duration of CB credit (in days) at the time of application in Home Credit
- `DAYS_ENDDATE_FACT`: Days since CB credit ended at the time of application in Home Credit (only for closed credit)
- `AMT_CREDIT_MAX_OVERDUE`: Maximal amount overdue on the Credit Bureau credit so far (at application date of loan in our sample)
- `CNT_CREDIT_PROLONG`: How many times was the Credit Bureau credit prolonged
- `AMT_CREDIT_SUM`: Current credit amount for the Credit Bureau credit
- `AMT_CREDIT_SUM_DEBT`: Current debt on Credit Bureau credit
- `AMT_CREDIT_SUM_LIMIT`: Current credit limit of credit card reported in Credit Bureau
- `AMT_CREDIT_SUM_OVERDUE`: Current amount overdue on Credit Bureau credit
- `CREDIT_TYPE`: Type of Credit Bureau credit (Car, cash,...)
- `DAYS_CREDIT_UPDATE`: How many days before loan application did last information about the Credit Bureau credit come
- `AMT_ANNUITY`: Annuity of the Credit Bureau credit

In [4]:
# CREDIT_TYPE labels that this notebook counts as income-generating credits.
income_generating_credits = (
    'Loan for business development',
    'Loan for working capital replenishment',
    'Real estate loan',
    'Loan for the purchase of equipment',
    'Loan for purchase of shares (margin lending)',
)

In [5]:
# Helper columns on `bureau`, computed with vectorized pandas operations instead of
# Python-level loops over every row.

# 1 when the credit is reported as "Active", 0 otherwise (missing values count as 0).
bureau["credit_active_dummy"] = (bureau["CREDIT_ACTIVE"] == "Active").astype("int64")

# Credit amount / overdue amount kept only for active credits; closed credits contribute 0.
bureau["active_AMT_CREDIT_SUM"] = bureau["AMT_CREDIT_SUM"] * bureau["credit_active_dummy"]
bureau["active_AMT_CREDIT_SUM_OVERDUE"] = bureau["AMT_CREDIT_SUM_OVERDUE"] * bureau["credit_active_dummy"]

# 1 when the credit type is one of the labels in `income_generating_credits`, 0 otherwise.
bureau["income_generating_credit"] = bureau["CREDIT_TYPE"].isin(income_generating_credits).astype("int64")

In [8]:
def aggregate_variable(name, func):
    """Aggregate one column of the module-level ``bureau`` frame per ``SK_ID_CURR``.

    Parameters
    ----------
    name : str
        Name of the ``bureau`` column to aggregate.
    func : callable or str
        Aggregation handed to ``DataFrameGroupBy.agg`` (e.g. ``np.sum`` or ``"sum"``).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SK_ID_CURR`` with a single column that keeps the name ``name``.
        For a standard-deviation aggregation (``np.std`` or ``"std"``), groups whose
        result is NaN get 0 instead.
    """
    aggregated = bureau[["SK_ID_CURR", name]].groupby("SK_ID_CURR").agg(func)
    is_std = func is np.std or (isinstance(func, str) and func == "std")
    if is_std:
        # Fill only the aggregated column; a boolean row mask assignment would
        # overwrite every column of the matching rows.
        aggregated[name] = aggregated[name].fillna(0)
    return aggregated

In [9]:
# Per-applicant (SK_ID_CURR) aggregates of the bureau table, keyed by feature name.
# Every value is indexed by SK_ID_CURR; the aggregated frames still carry the name of
# the source column, not the feature name used as the dictionary key.
transformations = {
    "number_of_credits_registered" : bureau[["SK_ID_CURR"]].groupby("SK_ID_CURR")['SK_ID_CURR'].count(),
    "share_of_active_credits_of_total_credits" : aggregate_variable("credit_active_dummy", np.mean),
    # NOTE(review): DAYS_CREDIT looks like negative days-before-application, so np.min
    # selects the oldest credit; if "last" means most recent this should be np.max - confirm.
    "last_credit_application_before_mortgage" : aggregate_variable("DAYS_CREDIT", np.min),
    "std_days_overdue_on_one_credit" : aggregate_variable("CREDIT_DAY_OVERDUE", np.std),
    "total_amount_of_credit_taken" : aggregate_variable("AMT_CREDIT_SUM", np.sum),
    "active_total_amount_of_credit_taken" : aggregate_variable("active_AMT_CREDIT_SUM", np.sum),
    "total_amount_of_credit_overdue" : aggregate_variable("AMT_CREDIT_SUM_OVERDUE", np.sum),
    # Uses the active-only overdue column so this entry is not a copy of the total above.
    "active_total_amount_of_credit_overdue" : aggregate_variable("active_AMT_CREDIT_SUM_OVERDUE", np.sum),
    "number_of_income_generating_credits" : aggregate_variable("income_generating_credit", np.sum),
    # TODO: add "share_of_income_generating_credits" as number_of_income_generating_credits /
    # number_of_credits_registered; it cannot reference sibling entries inside this literal.
}

In [10]:
# Display the dictionary of per-applicant aggregates built in the previous cell.
transformations

{'number_of_credits_registered': SK_ID_CURR
 100001     7
 100002     8
 100003     4
 100004     2
 100005     3
           ..
 456249    13
 456250     3
 456253     4
 456254     1
 456255    11
 Name: SK_ID_CURR, Length: 305811, dtype: int64,
 'share_of_active_credits_of_total_credits':             credit_active_dummy
 SK_ID_CURR                     
 100001                 0.428571
 100002                 0.250000
 100003                 0.250000
 100004                 0.000000
 100005                 0.666667
 ...                         ...
 456249                 0.153846
 456250                 0.666667
 456253                 0.500000
 456254                 0.000000
 456255                 0.454545
 
 [305811 rows x 1 columns],
 'last_credit_application_before_mortgage':             DAYS_CREDIT
 SK_ID_CURR             
 100001            -1572
 100002            -1437
 100003            -2586
 100004            -1326
 100005             -373
 ...                 ...
 456249

## Overview

In [None]:
# Number of distinct SK_ID_BUREAU values (a missing value, if any, counts as one).
bureau["SK_ID_BUREAU"].nunique(dropna=False)

`SK_ID_BUREAU` is a unique row identifier in the bureau dataset, meaning that the bureau can have multiple applications linked to one current bank credit.

In [None]:
# Share of missing values per bureau column, in percent, printed under a heading.
print(
    "Percentage of rows empty: ",
    "",
    bureau.isna().sum() / len(bureau) * 100,
    sep="\n",
)

## Feature creation

### Active credits

Calculate the share of active credits among all credits known for the applicant

In [None]:
# 1 when the credit is reported as "Active", 0 otherwise (vectorized, no row-wise loop).
bureau["credit_active_dummy"] = (bureau["CREDIT_ACTIVE"] == "Active").astype("int64")

# Mean of the 0/1 flag per applicant = share of that applicant's credits that are active.
share_of_active_credits_of_total_credits = (
    bureau.groupby("SK_ID_CURR")[["credit_active_dummy"]]
    .mean()
    .rename(columns={"credit_active_dummy": "share_of_active_credits_of_total_credits"})
)

# Number of bureau records per applicant, as a one-column frame indexed by SK_ID_CURR.
number_of_credits_registered = (
    bureau[["SK_ID_CURR"]]
    .groupby("SK_ID_CURR")["SK_ID_CURR"]
    .count()
    .to_frame("number_of_credits_registered")
)

### Last credit application

In [None]:
# Frequency of each DAYS_CREDIT value across all bureau records.
bureau["DAYS_CREDIT"].value_counts()

In [None]:
# aggregate_variable takes (column, aggregation) and keeps the source column name,
# so the feature name is applied to the result afterwards.
# NOTE(review): DAYS_CREDIT looks like negative days-before-application, so np.min
# selects the oldest credit; if "last" means most recent this should be np.max - confirm.
last_credit_application_before_mortgage = aggregate_variable("DAYS_CREDIT", np.min)
last_credit_application_before_mortgage.columns = ["last_credit_application_before_mortgage"]
last_credit_application_before_mortgage

### Days overdue

In [None]:
# Frequency of each CREDIT_DAY_OVERDUE value across all bureau records.
bureau["CREDIT_DAY_OVERDUE"].value_counts()

In [None]:
# Number of bureau records with a missing CREDIT_DAY_OVERDUE.
bureau["CREDIT_DAY_OVERDUE"].isna().sum()

In [None]:
# aggregate_variable takes (column, aggregation) and keeps the source column name,
# so the feature name is applied to the result afterwards.
std_days_overdue_on_one_credit = aggregate_variable("CREDIT_DAY_OVERDUE", np.std)
std_days_overdue_on_one_credit.columns = ["std_days_overdue_on_one_credit"]
# Applicants for whom no standard deviation could be computed get 0.
std_days_overdue_on_one_credit = std_days_overdue_on_one_credit.fillna(0)
std_days_overdue_on_one_credit

In [None]:
# Count the missing values left in the standard-deviation feature, per column.
pd.DataFrame(std_days_overdue_on_one_credit).isna().sum()

### Credit amount

This is the credit amount summed over all credits; it might make more sense to look only at active credit amounts (see below).

In [None]:
# Frequency of each AMT_CREDIT_SUM value across all bureau records.
bureau["AMT_CREDIT_SUM"].value_counts()

In [None]:
# aggregate_variable takes (column, aggregation) and keeps the source column name,
# so the feature name is applied to the result afterwards.
total_amount_of_credit_taken = aggregate_variable("AMT_CREDIT_SUM", np.sum)
total_amount_of_credit_taken.columns = ["total_amount_of_credit_taken"]
total_amount_of_credit_taken

This is only active credit amounts

In [None]:
# Credit amount kept only for active credits; closed credits contribute 0.
bureau["active_AMT_CREDIT_SUM"] = bureau["AMT_CREDIT_SUM"] * bureau["credit_active_dummy"]
# aggregate_variable takes (column, aggregation) and keeps the source column name,
# so the feature name is applied to the result afterwards.
active_total_amount_of_credit_taken = aggregate_variable("active_AMT_CREDIT_SUM", np.sum)
active_total_amount_of_credit_taken.columns = ["active_total_amount_of_credit_taken"]
active_total_amount_of_credit_taken

### Credit amount overdue

This is the overdue amount summed over all credits; it might make more sense to look only at the overdue amounts of active credits (see below).

In [None]:
# Frequency of each AMT_CREDIT_SUM_OVERDUE value across all bureau records.
bureau["AMT_CREDIT_SUM_OVERDUE"].value_counts()

In [None]:
# aggregate_variable takes (column, aggregation) and keeps the source column name,
# so the feature name is applied to the result afterwards.
total_amount_of_credit_overdue = aggregate_variable("AMT_CREDIT_SUM_OVERDUE", np.sum)
total_amount_of_credit_overdue.columns = ["total_amount_of_credit_overdue"]
total_amount_of_credit_overdue

This is only active credit amount overdue

In [None]:
# Overdue amount kept only for active credits; closed credits contribute 0.
bureau["active_AMT_CREDIT_SUM_OVERDUE"] = bureau["AMT_CREDIT_SUM_OVERDUE"] * bureau["credit_active_dummy"]
# Aggregate the active-only column created above (not the unfiltered overdue column),
# then apply the feature name; aggregate_variable takes (column, aggregation).
active_total_amount_of_credit_overdue = aggregate_variable("active_AMT_CREDIT_SUM_OVERDUE", np.sum)
active_total_amount_of_credit_overdue.columns = ["active_total_amount_of_credit_overdue"]
active_total_amount_of_credit_overdue

### Credit Type

In [None]:
# Frequency of each CREDIT_TYPE label across all bureau records.
bureau["CREDIT_TYPE"].value_counts()

Approach ideas:
- what kind of other credits taken?
- income generating vs consumer credits taken?

I think income generating vs consumer is best

In [None]:
# CREDIT_TYPE labels that this notebook counts as income-generating credits.
income_generating_credits = (
    'Loan for business development',
    'Loan for working capital replenishment',
    'Real estate loan',
    'Loan for the purchase of equipment',
    'Loan for purchase of shares (margin lending)',
)

In [None]:
# 1 when the credit type is one of the labels in `income_generating_credits`, 0 otherwise
# (vectorized membership test instead of a row-wise Python loop).
bureau["income_generating_credit"] = bureau["CREDIT_TYPE"].isin(income_generating_credits).astype("int64")

# Count of income-generating credits per applicant, as a one-column frame.
number_of_income_generating_credits = (
    bureau.groupby("SK_ID_CURR")[["income_generating_credit"]]
    .sum()
    .rename(columns={"income_generating_credit": "number_of_income_generating_credits"})
)

# Share = income-generating credits / all registered credits; the division aligns both
# operands on their SK_ID_CURR index. The result is a named Series.
share_of_income_generating_credits = (
    number_of_income_generating_credits["number_of_income_generating_credits"]
    / number_of_credits_registered["number_of_credits_registered"]
).rename("share_of_income_generating_credits")
share_of_income_generating_credits

# Bureau balance table

## Load data

In [None]:
# Read the raw monthly bureau-balance table from the project's data folder and display it.
bureau_balance = pd.read_csv(filepath_or_buffer="../data/bureau_balance.csv")
bureau_balance

- Recoded ID of Credit Bureau credit (unique coding for each application) use this to join to CREDIT_BUREAU table
- Month of balance relative to application date (-1 means the freshest balance date)
- Status of Credit Bureau loan during the month (active, closed, DPD0-30, ... [C means closed, X means status unknown, 0 means no DPD, 1 means maximal DPD during month between 1-30, 2 means DPD 31-60, ... 5 means DPD 120+ or sold or written off])

## Overview

In [None]:
# Share of missing values per bureau_balance column, in percent, printed under a heading.
print(
    "Percentage of rows empty: ",
    "",
    bureau_balance.isna().sum() / len(bureau_balance) * 100,
    sep="\n",
)

In [None]:
# Distinct (applicant ID, bureau credit ID) pairs, used to map bureau credits to applicants.
link_table = bureau.loc[:, ["SK_ID_CURR", "SK_ID_BUREAU"]].drop_duplicates()
link_table

In [None]:
# Frequency of each monthly STATUS code in the bureau-balance table.
bureau_balance["STATUS"].value_counts()

## Feature creation

In [None]:
# A month counts as "defaulted" when its STATUS is none of "0", "1", "2", "3" or "C".
# Comparing against the single literal "0|1|2|3|C" never matches a real status code
# and would flag every row, so membership in the list of codes is tested instead.
# NOTE(review): status "X" (unknown) is not in the list and is therefore flagged as
# defaulted - confirm that this is intended.
bureau_balance["defaulted"] = (~bureau_balance["STATUS"].isin(["0", "1", "2", "3", "C"])).astype("int64")
# 1 for balance months newer than -13 relative to the application, 0 otherwise.
bureau_balance["year_before_application"] = (bureau_balance["MONTHS_BALANCE"] > -13).astype("int64")
bureau_balance

In [None]:
# Per bureau credit: number of months flagged as defaulted among the rows marked
# year_before_application == 1.
defaulted_last_year_bureau = (
    bureau_balance.loc[bureau_balance["year_before_application"] == 1, ["SK_ID_BUREAU", "defaulted"]]
    .groupby("SK_ID_BUREAU")
    .sum()
)
defaulted_last_year_bureau.columns = ["defaulted_last_year_bureau"]
# Collapse the count to a 0/1 flag: at least one defaulted month (vectorized).
defaulted_last_year_bureau["defaulted_last_year_bureau"] = (
    defaulted_last_year_bureau["defaulted_last_year_bureau"].gt(0).astype("int64")
)
# Attach the applicant ID; the left join keeps credits that have no entry in link_table.
defaulted_last_year_bureau = defaulted_last_year_bureau.merge(link_table, on = "SK_ID_BUREAU", how = "left")[["SK_ID_CURR", "defaulted_last_year_bureau"]]

defaulted_last_year_bureau

In [None]:
# Per applicant: how many bureau credits carried a default flag.
defaulted_last_year = defaulted_last_year_bureau.groupby("SK_ID_CURR").sum()
# 0/1 flag: at least one such credit.
defaulted_last_year["defaulted_last_year"] = (
    defaulted_last_year["defaulted_last_year_bureau"].gt(0).astype("int64")
)
# `columns=` is required: a bare mapping passed to rename targets index labels,
# which would leave the column name unchanged.
defaulted_last_year = defaulted_last_year.rename(
    columns={"defaulted_last_year_bureau": "number_of_credits_defaulted_last_year"}
)
defaulted_last_year

# Getting features from both tables together

## Create dataframe with only IDs

In [None]:
# One row per distinct applicant ID found in the bureau table.
bureau_reduced = pd.DataFrame({"SK_ID_CURR": bureau["SK_ID_CURR"].unique()})
bureau_reduced

## Merge all features based on ID

In [None]:
# Per-applicant features to attach to the ID frame. Each one is keyed by SK_ID_CURR.
feature_list = [
    share_of_active_credits_of_total_credits,
    number_of_credits_registered,
    last_credit_application_before_mortgage,
    std_days_overdue_on_one_credit,
    total_amount_of_credit_taken,
    active_total_amount_of_credit_taken,
    total_amount_of_credit_overdue,
    active_total_amount_of_credit_overdue,
    share_of_income_generating_credits,
    defaulted_last_year,
]

# Restart from the bare ID column so that re-running this cell does not merge the
# same features into an already-merged frame (which would create suffixed duplicates).
bureau_reduced = bureau_reduced[["SK_ID_CURR"]]
for feature in feature_list:
    # Left join keeps every applicant; applicants without a value for a feature get NaN.
    bureau_reduced = bureau_reduced.merge(feature, on ='SK_ID_CURR', how = 'left')

## Overview of final table

In [None]:
# Number of missing values per column of the merged feature table.
bureau_reduced.isna().sum()

In [None]:
# Display the merged per-applicant feature table.
bureau_reduced