In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

from IPython.display import display, HTML

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from _preprocessing import change_dtypes
from _preprocessing import aggregate
from _stats import f_ratio
from _stats import corrwith

# 1. `application`

In [2]:
application_train = pd.read_csv("data/download/application_train.csv")
application_train = change_dtypes(application_train)

print("application_train.shape:", application_train.shape)
application_train.head()

Memory usage before changing types 300.13 MB
Memory usage after changing types 129.78 MB
application_train.shape: (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,True,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,False,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,False,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,False,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,False,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


## Missing values

In [None]:
missing_vals = application_train.isnull().mean()
missing_vals = missing_vals.sort_values(ascending=False)
print("Total number of features:", application_train.shape[1])
print("Number of features having missing values:", (missing_vals > 0).sum())

print("head:")
display(missing_vals.head(20))
print("tail")
display(missing_vals[missing_vals > 0].tail(20))

## Categorical columns

We group the `TARGET` by a categorical feature. `TARGET` in each group is aggregated by `mean`, `var` and `count`. The resulting `group mean`, `group variance` and `group count` are used to calculate the F-ratio which is define as the `between-group variance` divided by `within-group variance`.

In [None]:
def f_ratio_df(df):
    return f_ratio(df["default_rate"], df["var_of_default_rate"], df["count"])

In [None]:
# columns of categorical type
cat_cols = application_train.select_dtypes(["category"]).columns
agg_stats = [("default_rate", "mean"), ("var_of_default_rate", np.var), "count"]

results = []
for col in cat_cols:
    df = application_train.groupby(
        application_train[col].astype(str))["TARGET"].agg(agg_stats)
    df = df.sort_values("default_rate", ascending=False)
    results.append((col, df))

# sort by the F-ratio
results = sorted(results, key=lambda x: f_ratio_df(x[1]), reverse=True)
for col, df in results:
    print("%s, f-ratio=%0.5f" % (col, f_ratio_df(df)))
    display(df)
    print("------------------------------------------\n\n")

**For `NAME_INCOME_TYPE` we can try to reduce number of class by grouping classes as follows**

[Maternity leave, Unemployed] -> curr_not_working

[Businessman, Student] -> other

**For NAME_INCOME_TYPE we can group as follows**

[Businessman, Student] -> Other_A

[Unemployed, Maternity leave] -> Other_B

## Binary columns

In [None]:
bool_cols = application_train.drop(["TARGET"], axis=1).select_dtypes(["bool"]).columns

agg_stats = [("default_rate", "mean"), ("var_of_default_rate", np.var), "count"]

results = []
for col in bool_cols:
    df = application_train.groupby(
        application_train[col].astype(str))["TARGET"].agg(agg_stats)
    df = df.sort_values("default_rate", ascending=False)
    results.append((col, df))

# sort by the F-ratio of default_rate
results = sorted(results, key=lambda x: f_ratio_df(x[1]), reverse=True)
for col, df in results:
    print("%s, f-ratio=%0.5f" % (col, f_ratio_df(df)))
    display(df)
    print("------------------------------------------\n\n")

## Correlation of numerical features with the target

In [None]:
num_cols = application_train.drop(["TARGET", "SK_ID_CURR"] + list(cat_cols) + list(bool_cols), axis=1).columns

corrs = application_train.loc[:, num_cols].corrwith(application_train["TARGET"])
corrs = corrwith(application_train.loc[:, num_cols], application_train["TARGET"])

corrs.head(20)

In [None]:
corrs.tail(10)

## Histogram of numerical features

In [None]:
fig, axes = plt.subplots(nrows=25, ncols=4, figsize=(16, 120))
plt.subplots_adjust(hspace=0.3)
axes = axes.flatten()

for i, col in enumerate(corrs.index):
    application_train[col].plot(kind="hist", ax=axes[i])
    axes[i].set_title(col)

Some features are very screwed and range over many orders of magnitude such as `AMT_GOODS_PRICE`, `AMT_CREDIT`, `AMT_ANNUITY`, `AMT_INCOME_TOTAL`. May consider log transforming them.

`DAYS_EMPLOYED` are supposed to be negative but there is a very large positive value.

In [None]:
days_emp_max = application_train["DAYS_EMPLOYED"].max()
print("days_emp_max:", days_emp_max)

print("Count of the maximun value:", sum(application_train["DAYS_EMPLOYED"] == days_emp_max))
print("Count of positive values:", sum(application_train["DAYS_EMPLOYED"] > 0))

default_rate_pos = application_train.loc[application_train["DAYS_EMPLOYED"] == days_emp_max, "TARGET"].mean()
print("Default rate for positive DAYS_EMPLOYED: %0.5f" % default_rate_pos)

default_rate_neg = application_train.loc[application_train["DAYS_EMPLOYED"] < days_emp_max, "TARGET"].mean()
print("Default rate for negative DAYS_EMPLOYED: %0.5f" % default_rate_neg)


The difference is quite significant. So we may consider adding a binary column which indicates whether `DAYS_EMPLOYED` is positive. Also we may change the maximum to 1.

## Some engineered features

In [None]:
cols = ["AMT_GOODS_PRICE", "AMT_CREDIT", "AMT_ANNUITY", "AMT_INCOME_TOTAL"]
log_transformed_features = application_train[cols]
for col in cols:
    print(col, (log_transformed_features[col] < 0).any())

for col in cols:
    log_transformed_features[col + "_log"] = np.log(log_transformed_features[col] + 1)

corrs = corrwith(log_transformed_features, application_train["TARGET"])
corrs

It looks like only AMT_INCOME_TOTAL_log improves the correlation. 

In [None]:
credit_to_income = application_train["AMT_CREDIT"] / application_train["AMT_INCOME_TOTAL"]
credit_to_income.corr(application_train["TARGET"])

In [None]:
# this may be a useful feature
credit_to_goods = application_train["AMT_CREDIT"] / application_train["AMT_GOODS_PRICE"]
credit_to_goods.corr(application_train["TARGET"])

# 2. bureau

In [3]:
bureau = pd.read_csv("data/download/bureau.csv")
bureau = change_dtypes(bureau)

print("bureau.shape:", bureau.shape)
bureau.head()

Memory usage before changing types 233.43 MB
Memory usage after changing types 101.27 MB
bureau.shape: (1716428, 17)


Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


## Missing values

In [None]:
missing_vals = bureau.isnull().mean()
missing_vals = missing_vals.sort_values(ascending=False)
print("Total number of features:", bureau.shape[1])
print("Number of features having missing values:", (missing_vals > 0).sum())

print("head:")
display(missing_vals.head(10))

## Correlation of aggregated features with `TARGET`

### Numerical features

In [6]:
# aggregate numerical columns with count

bureau_agg = aggregate(bureau, by=["SK_ID_CURR"], dtype="num", num_stats=("count",))
print("bureau_agg shape:", bureau_agg.shape)

bureau_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_agg, how="left", on="SK_ID_CURR")
bureau_agg = bureau_agg.fillna(0)

corrs = corrwith(bureau_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_agg["TARGET"])
corrs

Drop 6 collinear columns
bureau_agg shape: (305811, 8)


feature
DAYS_ENDDATE_FACT_count        -0.036969
DAYS_CREDIT_ENDDATE_count      -0.016496
SK_ID_BUREAU_count             -0.010020
AMT_CREDIT_SUM_LIMIT_count     -0.009552
AMT_ANNUITY_count               0.008354
AMT_CREDIT_SUM_DEBT_count      -0.007407
AMT_CREDIT_MAX_OVERDUE_count   -0.002313
Name: corr, dtype: float64

In [8]:
# aggregate numerical columns with mean

bureau_agg = aggregate(bureau, by=["SK_ID_CURR"], dtype="num", num_stats=("mean",))

print("bureau_agg shape:", bureau_agg.shape)

bureau_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_agg, how="left", on="SK_ID_CURR")

corrs = corrwith(bureau_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_agg["TARGET"])
corrs

Drop 0 collinear columns
bureau_agg shape: (305811, 14)


feature
DAYS_CREDIT_mean               0.089729
DAYS_CREDIT_UPDATE_mean        0.068927
DAYS_ENDDATE_FACT_mean         0.053200
DAYS_CREDIT_ENDDATE_mean       0.046983
AMT_CREDIT_SUM_mean           -0.019957
SK_ID_BUREAU_mean             -0.011792
AMT_CREDIT_SUM_LIMIT_mean     -0.011446
CREDIT_DAY_OVERDUE_mean        0.008118
AMT_CREDIT_SUM_OVERDUE_mean    0.007150
CNT_CREDIT_PROLONG_mean        0.003031
AMT_CREDIT_MAX_OVERDUE_mean    0.002435
AMT_ANNUITY_mean              -0.001391
AMT_CREDIT_SUM_DEBT_mean      -0.000637
Name: corr, dtype: float64

In [9]:
# aggregate numerical columns with median

bureau_agg = aggregate(bureau, by=["SK_ID_CURR"], dtype="num", num_stats=("median",))

print("bureau_agg shape:", bureau_agg.shape)

bureau_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_agg, how="left", on="SK_ID_CURR")

corrs = corrwith(bureau_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_agg["TARGET"])
corrs

Drop 0 collinear columns
bureau_agg shape: (305811, 14)


feature
DAYS_CREDIT_median               0.085414
DAYS_CREDIT_UPDATE_median        0.067338
DAYS_ENDDATE_FACT_median         0.050603
DAYS_CREDIT_ENDDATE_median       0.039929
AMT_CREDIT_SUM_median           -0.015676
SK_ID_BUREAU_median             -0.011712
CREDIT_DAY_OVERDUE_median        0.007847
AMT_CREDIT_SUM_LIMIT_median     -0.006510
AMT_CREDIT_SUM_OVERDUE_median    0.003785
AMT_CREDIT_MAX_OVERDUE_median    0.002251
AMT_ANNUITY_median              -0.002110
AMT_CREDIT_SUM_DEBT_median       0.001666
CNT_CREDIT_PROLONG_median        0.001477
Name: corr, dtype: float64

In [10]:
# aggregate numerical columns with variance

bureau_agg = aggregate(bureau, by=["SK_ID_CURR"], dtype="num", num_stats=(np.var,))

print("bureau_agg shape:", bureau_agg.shape)

bureau_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_agg, how="left", on="SK_ID_CURR")

corrs = corrwith(bureau_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_agg["TARGET"])
corrs

Drop 0 collinear columns
bureau_agg shape: (305811, 14)


feature
DAYS_CREDIT_var              -0.038440
DAYS_CREDIT_ENDDATE_var       0.037370
DAYS_ENDDATE_FACT_var        -0.017603
SK_ID_BUREAU_var             -0.008212
AMT_CREDIT_SUM_LIMIT_var     -0.007636
AMT_CREDIT_SUM_DEBT_var      -0.005604
AMT_CREDIT_SUM_OVERDUE_var    0.002222
CREDIT_DAY_OVERDUE_var        0.001516
AMT_CREDIT_SUM_var           -0.001435
AMT_ANNUITY_var               0.001413
DAYS_CREDIT_UPDATE_var        0.001366
AMT_CREDIT_MAX_OVERDUE_var   -0.000870
CNT_CREDIT_PROLONG_var        0.000426
Name: corr, dtype: float64

In [11]:
# aggregate numerical columns with min

bureau_agg = aggregate(bureau, by=["SK_ID_CURR"], dtype="num", num_stats=("min",))

print("bureau_agg shape:", bureau_agg.shape)

bureau_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_agg, how="left", on="SK_ID_CURR")

corrs = corrwith(bureau_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_agg["TARGET"])
corrs

Drop 0 collinear columns
bureau_agg shape: (305811, 14)


feature
DAYS_CREDIT_min               0.075248
DAYS_ENDDATE_FACT_min         0.055887
DAYS_CREDIT_UPDATE_min        0.042864
DAYS_CREDIT_ENDDATE_min       0.034281
AMT_CREDIT_SUM_min           -0.010764
SK_ID_BUREAU_min             -0.010359
CREDIT_DAY_OVERDUE_min        0.007573
AMT_CREDIT_SUM_LIMIT_min     -0.004779
AMT_ANNUITY_min              -0.002542
AMT_CREDIT_MAX_OVERDUE_min    0.002329
AMT_CREDIT_SUM_DEBT_min       0.000242
CNT_CREDIT_PROLONG_min       -0.000182
AMT_CREDIT_SUM_OVERDUE_min    0.000003
Name: corr, dtype: float64

In [12]:
# aggregate numerical columns with max

bureau_agg = aggregate(bureau, by=["SK_ID_CURR"], dtype="num", num_stats=("max",))

print("bureau_agg shape:", bureau_agg.shape)

bureau_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_agg, how="left", on="SK_ID_CURR")

corrs = corrwith(bureau_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_agg["TARGET"])
corrs

Drop 0 collinear columns
bureau_agg shape: (305811, 14)


feature
DAYS_CREDIT_max               0.049782
DAYS_CREDIT_ENDDATE_max       0.036590
DAYS_CREDIT_UPDATE_max        0.028234
AMT_CREDIT_SUM_max           -0.019737
DAYS_ENDDATE_FACT_max         0.019644
SK_ID_BUREAU_max             -0.012658
AMT_CREDIT_SUM_LIMIT_max     -0.010633
AMT_CREDIT_SUM_OVERDUE_max    0.010614
CREDIT_DAY_OVERDUE_max        0.005493
CNT_CREDIT_PROLONG_max        0.003951
AMT_CREDIT_MAX_OVERDUE_max    0.002540
AMT_CREDIT_SUM_DEBT_max      -0.002173
AMT_ANNUITY_max               0.001120
Name: corr, dtype: float64

### Categorical features

In [16]:
# aggregate categorical columns with sum

bureau_agg = aggregate(bureau, by=["SK_ID_CURR"], dtype="cat", cat_stats=("sum",))
print("bureau_agg shape:", bureau_agg.shape)

bureau_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_agg, how="left", on="SK_ID_CURR")
#bureau_agg = bureau_agg.fillna(0)

corrs = corrwith(bureau_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_agg["TARGET"])
corrs

Drop 0 collinear columns
bureau_agg shape: (305811, 24)


feature
CREDIT_ACTIVE_Active_sum                                        0.067128
CREDIT_TYPE_Credit card_sum                                     0.034818
CREDIT_TYPE_Microloan_sum                                       0.034110
CREDIT_ACTIVE_Closed_sum                                       -0.030812
CREDIT_TYPE_Mortgage_sum                                       -0.023307
CREDIT_TYPE_Car loan_sum                                       -0.020817
CREDIT_ACTIVE_Sold_sum                                          0.012058
CREDIT_TYPE_Consumer credit_sum                                -0.010707
CREDIT_TYPE_Loan for working capital replenishment_sum          0.006018
CREDIT_CURRENCY_currency 2_sum                                 -0.006003
CREDIT_TYPE_Loan for the purchase of equipment_sum              0.004495
CREDIT_CURRENCY_currency 1_sum                                  0.004154
CREDIT_ACTIVE_Bad debt_sum                                      0.004003
CREDIT_TYPE_Loan for business development_s

In [17]:
# aggregate categorical columns with mean

bureau_agg = aggregate(bureau, by=["SK_ID_CURR"], dtype="cat", cat_stats=("mean",))
print("bureau_agg shape:", bureau_agg.shape)

bureau_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_agg, how="left", on="SK_ID_CURR")

corrs = corrwith(bureau_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_agg["TARGET"])
corrs

Drop 0 collinear columns
bureau_agg shape: (305811, 24)


feature
CREDIT_ACTIVE_Closed_mean                                       -0.079369
CREDIT_ACTIVE_Active_mean                                        0.077356
CREDIT_TYPE_Microloan_mean                                       0.044439
CREDIT_TYPE_Credit card_mean                                     0.034684
CREDIT_TYPE_Consumer credit_mean                                -0.026258
CREDIT_TYPE_Mortgage_mean                                       -0.020867
CREDIT_TYPE_Car loan_mean                                       -0.020134
CREDIT_ACTIVE_Sold_mean                                          0.016530
CREDIT_CURRENCY_currency 2_mean                                 -0.006821
CREDIT_CURRENCY_currency 1_mean                                  0.006037
CREDIT_ACTIVE_Bad debt_mean                                      0.004638
CREDIT_CURRENCY_currency 3_mean                                  0.002783
CREDIT_TYPE_Loan for business development_mean                  -0.002759
CREDIT_TYPE_Unknown type of lo

In [22]:
# aggregate categorical columns with number of unique values

bureau_agg = aggregate(bureau, by=["SK_ID_CURR"], dtype="cat", cat_stats=("nunique",), onehot_encode=False)
print("bureau_agg shape:", bureau_agg.shape)

bureau_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_agg, how="left", on="SK_ID_CURR")
bureau_agg = bureau_agg.fillna(0)

corrs = corrwith(bureau_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_agg["TARGET"])
corrs

Drop 0 collinear columns
bureau_agg shape: (305811, 4)


feature
CREDIT_CURRENCY_nunique   -0.031060
CREDIT_ACTIVE_nunique     -0.025051
CREDIT_TYPE_nunique       -0.019160
Name: corr, dtype: float64

### Engineered features

#### Count and percent of number of times DPD are over 1, 3, and 6 months

In [None]:
# agg by mean

credit_day_overdue = bureau[["SK_ID_CURR"]]
credit_day_overdue["CREDIT_DAY_OVERDUE_OVER_0M"] = bureau["CREDIT_DAY_OVERDUE"] == 0
credit_day_overdue["CREDIT_DAY_OVERDUE_OVER_1M"] = bureau["CREDIT_DAY_OVERDUE"] > 30
credit_day_overdue["CREDIT_DAY_OVERDUE_OVER_3M"] = bureau["CREDIT_DAY_OVERDUE"] > 90
credit_day_overdue["CREDIT_DAY_OVERDUE_OVER_6M"] = bureau["CREDIT_DAY_OVERDUE"] > 120

credit_day_overdue_agg = aggregate(credit_day_overdue, by=["SK_ID_CURR"], cat_stats=("mean",))
print("credit_day_overdue_agg shape:", credit_day_overdue_agg.shape)

credit_day_overdue_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(credit_day_overdue_agg, 
                                                                           how="left", on="SK_ID_CURR")

corrs = corrwith(credit_day_overdue_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), credit_day_overdue_agg["TARGET"])
corrs

In [None]:
# agg by sum

credit_day_overdue_agg = aggregate(credit_day_overdue, by=["SK_ID_CURR"], cat_stats=("sum",))
print("credit_day_overdue_agg shape:", credit_day_overdue_agg.shape)

credit_day_overdue_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(credit_day_overdue_agg, 
                                                                           how="left", on="SK_ID_CURR")
credit_day_overdue_agg = credit_day_overdue_agg.fillna(0)
corrs = corrwith(credit_day_overdue_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), credit_day_overdue_agg["TARGET"])
corrs

#### Whether `DAYS_CREDIT_ENDDATE` is positive

In [None]:
# agg by mean

days_credit_enddate_pos = bureau[["SK_ID_CURR"]]
days_credit_enddate_pos["DAYS_CREDIT_ENDDATE_POS"] = bureau["DAYS_CREDIT_ENDDATE"] > 0

days_credit_enddate_pos_agg = aggregate(days_credit_enddate_pos, by=["SK_ID_CURR"], cat_stats=("mean",))

print("days_credit_enddate_pos_agg shape:", days_credit_enddate_pos_agg.shape)

days_credit_enddate_pos_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(days_credit_enddate_pos_agg, 
                                                                           how="left", on="SK_ID_CURR")

corrs = corrwith(days_credit_enddate_pos_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), 
                 days_credit_enddate_pos_agg["TARGET"])
corrs

In [None]:
# agg by sum
days_credit_enddate_pos_agg = aggregate(days_credit_enddate_pos, by=["SK_ID_CURR"], cat_stats=("sum",))

print("days_credit_enddate_pos_agg shape:", days_credit_enddate_pos_agg.shape)

days_credit_enddate_pos_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(days_credit_enddate_pos_agg, 
                                                                           how="left", on="SK_ID_CURR")
days_credit_enddate_pos_agg = days_credit_enddate_pos_agg.fillna(0)

corrs = corrwith(days_credit_enddate_pos_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), 
                 days_credit_enddate_pos_agg["TARGET"])
corrs

#### Whether `DAYS_CREDIT_UPDATE` is positive

In [None]:
# very few cases 
(bureau["DAYS_CREDIT_UPDATE"] > 0).sum()

### Debt to Credit ratio and Total overdue to debt ratio

In [None]:
amt_agg = aggregate(bureau[["SK_ID_CURR", "AMT_CREDIT_SUM", "AMT_CREDIT_SUM_DEBT", "AMT_CREDIT_SUM_OVERDUE"]],
                   by=["SK_ID_CURR"], num_stats=("sum",))

amt_agg["DEBT_TO_CREDIT"] = amt_agg["AMT_CREDIT_SUM_DEBT_sum"] / amt_agg["AMT_CREDIT_SUM_sum"]
amt_agg["OVERDUE_TO_DEBT"] = amt_agg["AMT_CREDIT_SUM_OVERDUE_sum"] / amt_agg["AMT_CREDIT_SUM_DEBT_sum"]

amt_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(amt_agg, how="left", on="SK_ID_CURR")

amt_agg["DEBT_TO_CREDIT"] = amt_agg["DEBT_TO_CREDIT"].replace({np.inf: np.nan, -np.inf: np.nan})

amt_agg["OVERDUE_TO_DEBT"] = amt_agg["OVERDUE_TO_DEBT"].replace({np.inf: np.nan, -np.inf: np.nan})

corrs = corrwith(amt_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), amt_agg["TARGET"])
corrs

#### Time between successive loans

In [None]:
def mean_diff(x):
    return x.sort_values().diff().mean()

def var_diff(x):
    return x.sort_values().diff().var()

def min_diff(x):
    return x.sort_values().diff().min()

def max_diff(x):
    return x.sort_values().diff().max()

def range_diff(x):
    return max_diff(x) - min_diff(x)

In [None]:
time_bet_loans = aggregate(bureau[["SK_ID_CURR", "DAYS_CREDIT"]], by=["SK_ID_CURR"], 
                           num_stats=(mean_diff, var_diff, range_diff,))
time_bet_loans = application_train[["SK_ID_CURR", "TARGET"]].merge(time_bet_loans, how="left", on="SK_ID_CURR")

In [None]:
for col in time_bet_loans.columns:
    time_bet_loans[col] = time_bet_loans[col].replace({np.inf: np.nan, -np.inf: np.nan})
    
corrs = corrwith(time_bet_loans.drop(["SK_ID_CURR", "TARGET"], axis=1), time_bet_loans["TARGET"])
corrs

# 3. `bureau_balance`

In [23]:
bureau_balance = pd.read_csv("data/download/bureau_balance.csv")
bureau_balance = change_dtypes(bureau_balance)

print("bureau_balance.shape:", bureau_balance.shape)
bureau_balance.head()

Memory usage before changing types 655.20 MB
Memory usage after changing types 245.70 MB
bureau_balance.shape: (27299925, 3)


Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [24]:
print("MONTHS_BALANCE min:", bureau_balance["MONTHS_BALANCE"].min())
print("MONTHS_BALANCE max:", bureau_balance["MONTHS_BALANCE"].max())

print("STATUS unique values:", bureau_balance["STATUS"].unique())

MONTHS_BALANCE min: -96
MONTHS_BALANCE max: 0
STATUS unique values: [C, 0, X, 1, 2, 3, 5, 4]
Categories (8, object): [C, 0, X, 1, 2, 3, 5, 4]


## Missing values

In [None]:
missing_vals = bureau_balance.isnull().mean()
missing_vals = missing_vals.sort_values(ascending=False)
print("Total number of features:", bureau_balance.shape[1])
print("Number of features having missing values:", (missing_vals > 0).sum())

print("head:")
display(missing_vals.head(10))

## Aggregate once

In [29]:

bureau_balance_merge = application_train[["SK_ID_CURR"]].merge(bureau[["SK_ID_CURR", "SK_ID_BUREAU"]], 
                                                                      how="left", on="SK_ID_CURR")

bureau_balance_merge = bureau_balance_merge.merge(bureau_balance, how="left", on="SK_ID_BUREAU")
bureau_balance_merge = bureau_balance_merge.drop(["SK_ID_BUREAU"], axis=1)

bureau_balance_agg = aggregate(bureau_balance_merge, by=["SK_ID_CURR"], 
                               num_stats=("count", "mean", "min", "max", np.var),
                              cat_stats=("sum", "mean"))

bureau_balance_agg_uniq = aggregate(bureau_balance_merge, by=["SK_ID_CURR"], dtype="cat", 
                                    cat_stats=("nunique",))

bureau_balance_agg = bureau_balance_agg.merge(bureau_balance_agg_uniq, how="outer", on="SK_ID_CURR")

print("bureau_balance_agg shape:", bureau_balance_agg.shape)

bureau_balance_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_balance_agg, 
                                                                       how="left", on="SK_ID_CURR")

corrs = corrwith(bureau_balance_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_balance_agg["TARGET"])
corrs

Drop 0 collinear columns
Drop 0 collinear columns
bureau_balance_agg shape: (307511, 30)


feature
MONTHS_BALANCE_mean     0.074416
MONTHS_BALANCE_min      0.073225
MONTHS_BALANCE_var     -0.051335
STATUS_1_mean           0.033021
STATUS_1_nunique        0.023553
STATUS_C_sum           -0.021756
STATUS_C_mean          -0.017915
MONTHS_BALANCE_count   -0.017482
STATUS_0_mean           0.016825
STATUS_1_sum            0.013991
STATUS_2_mean           0.011150
STATUS_X_sum           -0.011022
STATUS_5_mean           0.010525
STATUS_3_mean           0.010089
STATUS_4_nunique        0.010085
STATUS_4_mean           0.009940
STATUS_C_nunique       -0.009337
STATUS_5_nunique        0.009054
STATUS_3_nunique        0.008935
STATUS_4_sum            0.008585
STATUS_2_nunique        0.008455
STATUS_3_sum            0.007912
STATUS_2_sum            0.007755
STATUS_0_sum           -0.007233
STATUS_5_sum            0.005691
MONTHS_BALANCE_max      0.005119
STATUS_X_nunique        0.002118
STATUS_0_nunique        0.000558
STATUS_X_mean           0.000528
Name: corr, dtype: float64

## Aggregate stepwise

In [30]:
bureau_balance_agg = aggregate(bureau_balance, by=["SK_ID_BUREAU"], 
                               num_stats=("count", "mean", "min", "max", np.var),
                               cat_stats=("sum", "mean"))

bureau_balance_agg_uniq = aggregate(bureau_balance, by=["SK_ID_BUREAU"], dtype="cat",
                                    cat_stats=("nunique",), onehot_encode=False)

bureau_balance_agg = bureau_balance_agg.merge(bureau_balance_agg_uniq, how="outer", on="SK_ID_BUREAU")

bureau_balance_agg = bureau[["SK_ID_CURR", "SK_ID_BUREAU"]].merge(bureau_balance_agg, how="left", on="SK_ID_BUREAU")

bureau_balance_agg = bureau_balance_agg.drop(["SK_ID_BUREAU"], axis=1)

bureau_balance_agg = aggregate(bureau_balance_agg, by=["SK_ID_CURR"], 
                              num_stats=("count", "mean", "min", "max", np.var),
                              cat_stats=("sum", "mean"))

print("bureau_balance_agg shape:", bureau_balance_agg.shape)

bureau_balance_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_balance_agg, 
                                                                       how="left", on="SK_ID_CURR")

corrs = corrwith(bureau_balance_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_balance_agg["TARGET"])
corrs

Drop 0 collinear columns
Drop 0 collinear columns
No categorical columns in df
Drop 20 collinear columns
bureau_balance_agg shape: (305811, 91)


feature
MONTHS_BALANCE_min_mean       0.089038
MONTHS_BALANCE_count_mean    -0.080193
MONTHS_BALANCE_mean_mean      0.076424
MONTHS_BALANCE_min_min        0.073225
MONTHS_BALANCE_count_max     -0.068792
MONTHS_BALANCE_var_mean      -0.062957
STATUS_C_sum_mean            -0.062954
STATUS_1_mean_mean            0.061183
STATUS_1_mean_max             0.061122
STATUS_C_sum_max             -0.061083
MONTHS_BALANCE_var_max       -0.059855
STATUS_C_mean_mean           -0.055936
MONTHS_BALANCE_mean_min       0.053816
MONTHS_BALANCE_min_max        0.049807
MONTHS_BALANCE_count_min     -0.048224
STATUS_C_sum_var             -0.046157
STATUS_0_mean_mean            0.044198
STATUS_C_mean_max            -0.044119
MONTHS_BALANCE_mean_max       0.043956
MONTHS_BALANCE_var_var       -0.040304
MONTHS_BALANCE_count_var     -0.037909
STATUS_1_mean_var             0.037443
STATUS_1_mean_min             0.037332
STATUS_0_sum_mean            -0.035868
MONTHS_BALANCE_min_var       -0.035045
STATUS_X_sum_mean