In [None]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import os
from IPython.display import display, HTML

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from _preprocessing import change_dtypes
from _preprocessing import aggregate
from _stats import f_ratio
from _stats import corrwith
from _stats import mode

INP_DIR = "data/download"

# 1. `application`

In [None]:
# Read the training applications and pass them through the project's dtype helper.
train_csv_path = os.path.join(INP_DIR, "application_train.csv")
application_train = change_dtypes(pd.read_csv(train_csv_path))

print("application_train.shape:", application_train.shape)
application_train.head()

## Missing values

In [None]:
# Fraction of missing entries per column, largest first.
missing_vals = application_train.isnull().mean().sort_values(ascending=False)
has_missing = missing_vals > 0
print("Total number of features:", application_train.shape[1])
print("Number of features having missing values:", has_missing.sum())

print("head:")
display(missing_vals.head(20))
print("tail")
# Restrict the tail to columns that actually contain missing values.
display(missing_vals[has_missing].tail(20))

## Categorical columns

We group `TARGET` by a categorical feature. Within each group, `TARGET` is aggregated by `mean`, `var` and `count`. The resulting `group mean`, `group variance` and `group count` are used to calculate the F-ratio, which is defined as the `between-group variance` divided by the `within-group variance`.

In [None]:
def f_ratio_df(df):
    """Compute the F-ratio from a per-group summary table.

    ``df`` must provide the columns ``default_rate``, ``var_of_default_rate``
    and ``count``; they are forwarded, in that order, to ``f_ratio``.
    """
    group_mean = df["default_rate"]
    group_var = df["var_of_default_rate"]
    group_count = df["count"]
    return f_ratio(group_mean, group_var, group_count)

In [None]:
# Categorical columns: per-level TARGET statistics, ranked by F-ratio.
cat_cols = application_train.select_dtypes(["category"]).columns
agg_stats = [("default_rate", "mean"), ("var_of_default_rate", np.var), "count"]

results = []
for col in cat_cols:
    # Group on the string form of the column.
    levels = application_train[col].astype(str)
    df = application_train.groupby(levels)["TARGET"].agg(agg_stats)
    results.append((col, df.sort_values("default_rate", ascending=False)))

# Largest F-ratio first.
results.sort(key=lambda pair: f_ratio_df(pair[1]), reverse=True)
for col, df in results:
    print("%s, f-ratio=%0.5f" % (col, f_ratio_df(df)))
    display(df)
    print("------------------------------------------\n\n")

## Binary columns

In [None]:
# Boolean feature columns (TARGET itself is excluded before the dtype filter).
features_only = application_train.drop(columns=["TARGET"])
bool_cols = features_only.select_dtypes(["bool"]).columns

agg_stats = [("default_rate", "mean"), ("var_of_default_rate", np.var), "count"]

results = []
for col in bool_cols:
    # Group on the string form of the column.
    levels = application_train[col].astype(str)
    df = application_train.groupby(levels)["TARGET"].agg(agg_stats)
    results.append((col, df.sort_values("default_rate", ascending=False)))

# Rank the features by the F-ratio of default_rate, largest first.
results.sort(key=lambda pair: f_ratio_df(pair[1]), reverse=True)
for col, df in results:
    print("%s, f-ratio=%0.5f" % (col, f_ratio_df(df)))
    display(df)
    print("------------------------------------------\n\n")

## Correlation of numerical features with the target

In [None]:
# Numerical features: everything except the id, the target, and the
# categorical and boolean columns selected in the cells above.
non_numeric = ["TARGET", "SK_ID_CURR"] + list(cat_cols) + list(bool_cols)
num_cols = application_train.drop(non_numeric, axis=1).columns

# Correlate each numerical feature with TARGET using the project's `corrwith`
# helper only; an additional pandas DataFrame.corrwith call here would be
# computed and then immediately overwritten.
corrs = corrwith(application_train.loc[:, num_cols], application_train["TARGET"])

corrs.head(20)

In [None]:
# Show the last ten entries of the correlation series `corrs`.
corrs.tail(10)

## Histogram of numerical features

In [None]:
# One histogram per numerical feature. The grid is sized from the number of
# features so that it can neither overflow (IndexError with a fixed 25x4 grid
# and more than 100 features) nor leave a long tail of empty panels.
hist_cols = list(corrs.index)
n_grid_cols = 4
n_grid_rows = max(1, -(-len(hist_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols,
                         figsize=(16, 4.8 * n_grid_rows))  # 4.8 in per row, as in the 25-row/120-in layout
plt.subplots_adjust(hspace=0.3)
axes = axes.flatten()

for i, col in enumerate(hist_cols):
    application_train[col].plot(kind="hist", ax=axes[i])
    axes[i].set_title(col)

# Hide the unused panels in the last row of the grid.
for unused_ax in axes[len(hist_cols):]:
    unused_ax.set_visible(False)

Some features are very skewed and range over many orders of magnitude, such as `AMT_GOODS_PRICE`, `AMT_CREDIT`, `AMT_ANNUITY` and `AMT_INCOME_TOTAL`. We may consider log-transforming them.

`DAYS_EMPLOYED` is supposed to be negative, but there is a very large positive value.

In [None]:
days_emp = application_train["DAYS_EMPLOYED"]
days_emp_max = days_emp.max()
print("days_emp_max:", days_emp_max)

# Vectorised Series.sum() instead of the builtin sum(), which would iterate
# over the boolean Series element by element.
print("Count of the maximun value:", (days_emp == days_emp_max).sum())
print("Count of positive values:", (days_emp > 0).sum())

# The two groups are split on the maximum value. The "positive"/"negative"
# labels below are only accurate when the two counts printed above agree,
# i.e. when the maximum is the only positive value.
default_rate_pos = application_train.loc[days_emp == days_emp_max, "TARGET"].mean()
print("Default rate for positive DAYS_EMPLOYED: %0.5f" % default_rate_pos)

default_rate_neg = application_train.loc[days_emp < days_emp_max, "TARGET"].mean()
print("Default rate for negative DAYS_EMPLOYED: %0.5f" % default_rate_neg)


The difference is quite significant. So we may consider adding a binary column which indicates whether `DAYS_EMPLOYED` is positive. Also we may change the maximum to 1 or `np.nan`.

## Some engineered features

In [None]:
cols = ["AMT_GOODS_PRICE", "AMT_CREDIT", "AMT_ANNUITY", "AMT_INCOME_TOTAL"]
# Explicit copy: columns are added below, and assigning into a column
# selection of application_train triggers pandas' SettingWithCopy behaviour.
log_transformed_features = application_train[cols].copy()

# Sanity check before taking logs: print whether any value is negative.
for col in cols:
    print(col, (log_transformed_features[col] < 0).any())

# log(1 + x) keeps zero amounts finite; log1p is the dedicated numpy routine.
for col in cols:
    log_transformed_features[col + "_log"] = np.log1p(log_transformed_features[col])

# Correlation of both the raw and the log-transformed amounts with TARGET.
corrs = corrwith(log_transformed_features, application_train["TARGET"])
corrs

It looks like only AMT_INCOME_TOTAL_log improves the correlation. 

In [None]:
# Ratio of the credit amount to the applicant's total income, correlated with TARGET.
credit_to_income = application_train["AMT_CREDIT"].div(application_train["AMT_INCOME_TOTAL"])
credit_to_income.corr(application_train["TARGET"])

In [None]:
# Ratio of the credit amount to the goods price -- a candidate feature;
# its correlation with TARGET is shown below.
credit_to_goods = application_train["AMT_CREDIT"].div(application_train["AMT_GOODS_PRICE"])
credit_to_goods.corr(application_train["TARGET"])

# 2. bureau

In [None]:
# Read the bureau table and pass it through the project's dtype helper.
bureau_csv_path = os.path.join(INP_DIR, "bureau.csv")
bureau = change_dtypes(pd.read_csv(bureau_csv_path))

print("bureau.shape:", bureau.shape)
bureau.head()

## Missing values

In [None]:
# Fraction of missing entries per bureau column, largest first.
missing_vals = bureau.isnull().mean().sort_values(ascending=False)
n_with_missing = (missing_vals > 0).sum()
print("Total number of features:", bureau.shape[1])
print("Number of features having missing values:", n_with_missing)

print("head:")
display(missing_vals.head(10))

## Correlation of aggregated features with `TARGET`

### Numerical features

In [None]:
# Numerical bureau columns aggregated per applicant with `count`.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=("count",),
)
print("bureau_agg shape:", bureau_agg.shape)

# Left-join onto the training ids; values missing after the join become 0.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_agg = id_target.merge(bureau_agg, how="left", on="SK_ID_CURR").fillna(0)

agg_features = bureau_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_agg["TARGET"])
corrs

In [None]:
# Numerical bureau columns aggregated per applicant with `mean`.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=("mean",),
)

print("bureau_agg shape:", bureau_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_agg = id_target.merge(bureau_agg, how="left", on="SK_ID_CURR")

agg_features = bureau_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_agg["TARGET"])
corrs

In [None]:
# Numerical bureau columns aggregated per applicant with `median`.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=("median",),
)

print("bureau_agg shape:", bureau_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_agg = id_target.merge(bureau_agg, how="left", on="SK_ID_CURR")

agg_features = bureau_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_agg["TARGET"])
corrs

In [None]:
# Numerical bureau columns aggregated per applicant with `np.var`.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=(np.var,),
)

print("bureau_agg shape:", bureau_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_agg = id_target.merge(bureau_agg, how="left", on="SK_ID_CURR")

agg_features = bureau_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_agg["TARGET"])
corrs

In [None]:
# Numerical bureau columns aggregated per applicant with `min`.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=("min",),
)

print("bureau_agg shape:", bureau_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_agg = id_target.merge(bureau_agg, how="left", on="SK_ID_CURR")

agg_features = bureau_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_agg["TARGET"])
corrs

In [None]:
# Numerical bureau columns aggregated per applicant with `max`.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=("max",),
)

print("bureau_agg shape:", bureau_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_agg = id_target.merge(bureau_agg, how="left", on="SK_ID_CURR")

agg_features = bureau_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_agg["TARGET"])
corrs

### Categorical features

In [None]:
# Number of distinct values of every category / object / bool column in bureau.
cat_cols = bureau.select_dtypes(["category", "object", "bool"]).columns
for col, n_levels in bureau[cat_cols].nunique().items():
    print(col, n_levels)

In [None]:
# Categorical bureau columns aggregated per applicant with `sum`.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="cat",
    cat_stats=("sum",),
)
print("bureau_agg shape:", bureau_agg.shape)

# Left-join onto the training ids; missing values after the join are left
# as they are in this cell (no fillna).
id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_agg = id_target.merge(bureau_agg, how="left", on="SK_ID_CURR")

agg_features = bureau_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_agg["TARGET"])
corrs

In [None]:
# Categorical bureau columns aggregated per applicant with `mean`.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="cat",
    cat_stats=("mean",),
)
print("bureau_agg shape:", bureau_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_agg = id_target.merge(bureau_agg, how="left", on="SK_ID_CURR")

agg_features = bureau_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_agg["TARGET"])
corrs

In [None]:
# Categorical bureau columns aggregated per applicant with `nunique`,
# with one-hot encoding switched off in the helper call.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="cat",
    cat_stats=("nunique",),
    onehot_encode=False,
)
print("bureau_agg shape:", bureau_agg.shape)

# Left-join onto the training ids; values missing after the join become 0.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_agg = id_target.merge(bureau_agg, how="left", on="SK_ID_CURR").fillna(0)

agg_features = bureau_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_agg["TARGET"])
corrs

In [None]:
# Categorical bureau columns aggregated per applicant with the project's
# `mode` statistic, with one-hot encoding switched off; only a preview is shown.
bureau_agg = aggregate(
    bureau.drop(columns=["SK_ID_BUREAU"]),
    by=["SK_ID_CURR"],
    dtype="cat",
    cat_stats=(mode,),
    onehot_encode=False,
)
print("bureau_agg shape:", bureau_agg.shape)
bureau_agg.head()

### Engineered features

#### Count and percent of number of times DPD are over 1, 3, and 6 months

In [None]:
# agg by mean

# Explicit copy: flag columns are added below, and assigning into a column
# selection of `bureau` triggers pandas' SettingWithCopy behaviour.
credit_day_overdue = bureau[["SK_ID_CURR"]].copy()

# Boolean flags for "days past due exceeds N months" (30 days per month).
# OVER_0M means any overdue at all (> 0 days; it was `== 0`, which flags the
# opposite), and OVER_6M uses 180 days (it was 120, i.e. four months).
credit_day_overdue["CREDIT_DAY_OVERDUE_OVER_0M"] = bureau["CREDIT_DAY_OVERDUE"] > 0
credit_day_overdue["CREDIT_DAY_OVERDUE_OVER_1M"] = bureau["CREDIT_DAY_OVERDUE"] > 30
credit_day_overdue["CREDIT_DAY_OVERDUE_OVER_3M"] = bureau["CREDIT_DAY_OVERDUE"] > 90
credit_day_overdue["CREDIT_DAY_OVERDUE_OVER_6M"] = bureau["CREDIT_DAY_OVERDUE"] > 180

# Mean of each flag per applicant.
credit_day_overdue_agg = aggregate(credit_day_overdue, by=["SK_ID_CURR"], cat_stats=("mean",))
print("credit_day_overdue_agg shape:", credit_day_overdue_agg.shape)

credit_day_overdue_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(credit_day_overdue_agg,
                                                                           how="left", on="SK_ID_CURR")

corrs = corrwith(credit_day_overdue_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), credit_day_overdue_agg["TARGET"])
corrs

In [None]:
# Overdue flags aggregated per applicant with `sum`.
credit_day_overdue_agg = aggregate(
    credit_day_overdue,
    by=["SK_ID_CURR"],
    cat_stats=("sum",),
)
print("credit_day_overdue_agg shape:", credit_day_overdue_agg.shape)

# Left-join onto the training ids; values missing after the join become 0.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
credit_day_overdue_agg = id_target.merge(credit_day_overdue_agg, how="left", on="SK_ID_CURR").fillna(0)

agg_features = credit_day_overdue_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, credit_day_overdue_agg["TARGET"])
corrs

#### Whether `DAYS_CREDIT_ENDDATE` is positive

In [None]:
# agg by mean

# Explicit copy: a flag column is added below, and assigning into a column
# selection of `bureau` triggers pandas' SettingWithCopy behaviour.
days_credit_enddate_pos = bureau[["SK_ID_CURR"]].copy()
# Note: a missing DAYS_CREDIT_ENDDATE compares as False here (NaN > 0 is False).
days_credit_enddate_pos["DAYS_CREDIT_ENDDATE_POS"] = bureau["DAYS_CREDIT_ENDDATE"] > 0

# Mean of the flag per applicant.
days_credit_enddate_pos_agg = aggregate(days_credit_enddate_pos, by=["SK_ID_CURR"], cat_stats=("mean",))

print("days_credit_enddate_pos_agg shape:", days_credit_enddate_pos_agg.shape)

days_credit_enddate_pos_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(days_credit_enddate_pos_agg,
                                                                           how="left", on="SK_ID_CURR")

corrs = corrwith(days_credit_enddate_pos_agg.drop(["SK_ID_CURR", "TARGET"], axis=1),
                 days_credit_enddate_pos_agg["TARGET"])
corrs

In [None]:
# DAYS_CREDIT_ENDDATE_POS flag aggregated per applicant with `sum`.
days_credit_enddate_pos_agg = aggregate(
    days_credit_enddate_pos,
    by=["SK_ID_CURR"],
    cat_stats=("sum",),
)

print("days_credit_enddate_pos_agg shape:", days_credit_enddate_pos_agg.shape)

# Left-join onto the training ids; values missing after the join become 0.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
days_credit_enddate_pos_agg = id_target.merge(
    days_credit_enddate_pos_agg, how="left", on="SK_ID_CURR"
).fillna(0)

agg_features = days_credit_enddate_pos_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, days_credit_enddate_pos_agg["TARGET"])
corrs

#### Whether `DAYS_CREDIT_UPDATE` is positive

In [None]:
# Number of bureau rows with a positive DAYS_CREDIT_UPDATE.
# Very few cases, so no flag feature is built from it in this section.
(bureau["DAYS_CREDIT_UPDATE"] > 0).sum()

### Debt to Credit ratio and Total overdue to debt ratio

In [None]:
# Per-applicant sums of credit, outstanding debt and overdue amount.
amt_cols = ["SK_ID_CURR", "AMT_CREDIT_SUM", "AMT_CREDIT_SUM_DEBT", "AMT_CREDIT_SUM_OVERDUE"]
amt_agg = aggregate(bureau[amt_cols], by=["SK_ID_CURR"], num_stats=("sum",))

total_debt = amt_agg["AMT_CREDIT_SUM_DEBT_sum"]
amt_agg["DEBT_TO_CREDIT"] = total_debt / amt_agg["AMT_CREDIT_SUM_sum"]
amt_agg["OVERDUE_TO_DEBT"] = amt_agg["AMT_CREDIT_SUM_OVERDUE_sum"] / total_debt

id_target = application_train[["SK_ID_CURR", "TARGET"]]
amt_agg = id_target.merge(amt_agg, how="left", on="SK_ID_CURR")

# Infinite ratios are converted to missing values before correlating.
for ratio_col in ("DEBT_TO_CREDIT", "OVERDUE_TO_DEBT"):
    amt_agg[ratio_col] = amt_agg[ratio_col].replace({np.inf: np.nan, -np.inf: np.nan})

agg_features = amt_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, amt_agg["TARGET"])
corrs

#### Time between successive loans

In [None]:
# Aggregate DAYS_CREDIT per applicant with three custom statistics, then
# left-join the result onto the training ids and TARGET.
# NOTE(review): mean_diff, var_diff and range_diff are not imported or defined
# in any visible cell, so this cell raises NameError on a fresh kernel.
# Presumably they live in `_stats` alongside `mode` -- confirm and add them to
# the import cell.
time_bet_loans = aggregate(bureau[["SK_ID_CURR", "DAYS_CREDIT"]], by=["SK_ID_CURR"], 
                           num_stats=(mean_diff, var_diff, range_diff,))
time_bet_loans = application_train[["SK_ID_CURR", "TARGET"]].merge(time_bet_loans, how="left", on="SK_ID_CURR")

In [None]:
# Convert +/-inf to NaN in every column before computing correlations.
inf_to_nan = {np.inf: np.nan, -np.inf: np.nan}
for col in time_bet_loans.columns:
    time_bet_loans[col] = time_bet_loans[col].replace(inf_to_nan)

agg_features = time_bet_loans.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, time_bet_loans["TARGET"])
corrs

# 3. `bureau_balance`

In [None]:
# Read the bureau_balance table and pass it through the project's dtype helper.
bureau_balance_csv_path = os.path.join(INP_DIR, "bureau_balance.csv")
bureau_balance = change_dtypes(pd.read_csv(bureau_balance_csv_path))

print("bureau_balance.shape:", bureau_balance.shape)
bureau_balance.head()

In [None]:
# Value range of MONTHS_BALANCE and the distinct STATUS values.
months_balance = bureau_balance["MONTHS_BALANCE"]
print("MONTHS_BALANCE min:", months_balance.min())
print("MONTHS_BALANCE max:", months_balance.max())

status_values = bureau_balance["STATUS"].unique()
print("STATUS unique values:", status_values)

## Missing values

In [None]:
# Fraction of missing entries per bureau_balance column, largest first.
missing_vals = bureau_balance.isnull().mean().sort_values(ascending=False)
n_with_missing = (missing_vals > 0).sum()
print("Total number of features:", bureau_balance.shape[1])
print("Number of features having missing values:", n_with_missing)

print("head:")
display(missing_vals.head(10))

## Aggregate once

In [None]:
# Attach every bureau_balance row to its applicant:
# training ids -> bureau records (SK_ID_BUREAU) -> monthly balance rows.
bureau_balance_merge = application_train[["SK_ID_CURR"]].merge(bureau[["SK_ID_CURR", "SK_ID_BUREAU"]],
                                                               how="left", on="SK_ID_CURR")

bureau_balance_merge = bureau_balance_merge.merge(bureau_balance, how="left", on="SK_ID_BUREAU")
# The bureau id was only needed as the join key.
bureau_balance_merge = bureau_balance_merge.drop(["SK_ID_BUREAU"], axis=1)

# Aggregate all balance rows of an applicant in a single step.
bureau_balance_agg = aggregate(bureau_balance_merge, by=["SK_ID_CURR"],
                               num_stats=("count", "mean", "min", "max", np.var),
                               cat_stats=("sum", "mean"))

# `nunique` is computed with onehot_encode=False, matching every other
# nunique aggregation in this notebook.
bureau_balance_agg_uniq = aggregate(bureau_balance_merge, by=["SK_ID_CURR"], dtype="cat",
                                    cat_stats=("nunique",), onehot_encode=False)

bureau_balance_agg = bureau_balance_agg.merge(bureau_balance_agg_uniq, how="outer", on="SK_ID_CURR")

print("bureau_balance_agg shape:", bureau_balance_agg.shape)

bureau_balance_agg = application_train[["SK_ID_CURR", "TARGET"]].merge(bureau_balance_agg,
                                                                       how="left", on="SK_ID_CURR")

corrs = corrwith(bureau_balance_agg.drop(["SK_ID_CURR", "TARGET"], axis=1), bureau_balance_agg["TARGET"])
corrs

## Aggregate stepwise

In [None]:
# Step 1: aggregate the monthly balance rows per bureau record.
num_stats_all = ("count", "mean", "min", "max", np.var)
cat_stats_all = ("sum", "mean")

bureau_balance_agg = aggregate(
    bureau_balance,
    by=["SK_ID_BUREAU"],
    num_stats=num_stats_all,
    cat_stats=cat_stats_all,
)

bureau_balance_agg_uniq = aggregate(
    bureau_balance,
    by=["SK_ID_BUREAU"],
    dtype="cat",
    cat_stats=("nunique",),
    onehot_encode=False,
)

bureau_balance_agg = bureau_balance_agg.merge(bureau_balance_agg_uniq, how="outer", on="SK_ID_BUREAU")

# Step 2: map each bureau record to its applicant, then drop the join key.
bureau_keys = bureau[["SK_ID_CURR", "SK_ID_BUREAU"]]
bureau_balance_agg = bureau_keys.merge(bureau_balance_agg, how="left", on="SK_ID_BUREAU")
bureau_balance_agg = bureau_balance_agg.drop(columns=["SK_ID_BUREAU"])

# Step 3: aggregate the per-record statistics again, this time per applicant.
bureau_balance_agg = aggregate(
    bureau_balance_agg,
    by=["SK_ID_CURR"],
    num_stats=num_stats_all,
    cat_stats=cat_stats_all,
)

print("bureau_balance_agg shape:", bureau_balance_agg.shape)

id_target = application_train[["SK_ID_CURR", "TARGET"]]
bureau_balance_agg = id_target.merge(bureau_balance_agg, how="left", on="SK_ID_CURR")

agg_features = bureau_balance_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, bureau_balance_agg["TARGET"])
corrs

# 4. `previous_application`

In [None]:
# Read the previous_application table and pass it through the project's dtype helper.
prev_app_csv_path = os.path.join(INP_DIR, "previous_application.csv")
previous_application = change_dtypes(pd.read_csv(prev_app_csv_path))

print("previous_application shape:", previous_application.shape)
previous_application.head()

## Missing values
`RATE_INTEREST_PRIVILEGED` and `RATE_INTEREST_PRIMARY` have more than 99% missing values. We can drop these two columns.

In [None]:
# Fraction of missing entries per previous_application column, largest first.
missing_vals = previous_application.isnull().mean()
missing_vals = missing_vals.sort_values(ascending=False)
print("Total number of features:", previous_application.shape[1])
print("Number of features having missing values:", (missing_vals > 0).sum())

print("head:")
display(missing_vals.head(20))

# Drop the two almost-empty rate columns. errors="ignore" keeps this cell
# re-runnable: a second execution would otherwise raise KeyError because the
# columns have already been removed from `previous_application`.
previous_application = previous_application.drop(
    columns=["RATE_INTEREST_PRIVILEGED", "RATE_INTEREST_PRIMARY"], errors="ignore")

`RATE_INTEREST_PRIVILEGED` and `RATE_INTEREST_PRIMARY` have more than 99% missing values. We can drop these two columns.

## Aggregate numerical columns

In [None]:
# Numerical previous_application columns aggregated per applicant with `count`.
previous_application_agg = aggregate(
    previous_application.drop(columns=["SK_ID_PREV"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=("count",),
)

print("previous_application_agg shape:", previous_application_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
previous_application_agg = id_target.merge(previous_application_agg, how="left", on="SK_ID_CURR")

agg_features = previous_application_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, previous_application_agg["TARGET"])
corrs

In [None]:
# Numerical previous_application columns aggregated per applicant with `mean`.
previous_application_agg = aggregate(
    previous_application.drop(columns=["SK_ID_PREV"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=("mean",),
)

print("previous_application_agg shape:", previous_application_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
previous_application_agg = id_target.merge(previous_application_agg, how="left", on="SK_ID_CURR")

agg_features = previous_application_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, previous_application_agg["TARGET"])
corrs

In [None]:
# Numerical previous_application columns aggregated per applicant with `min`.
previous_application_agg = aggregate(
    previous_application.drop(columns=["SK_ID_PREV"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=("min",),
)

print("previous_application_agg shape:", previous_application_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
previous_application_agg = id_target.merge(previous_application_agg, how="left", on="SK_ID_CURR")

agg_features = previous_application_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, previous_application_agg["TARGET"])
corrs

In [None]:
# Numerical previous_application columns aggregated per applicant with `max`.
previous_application_agg = aggregate(
    previous_application.drop(columns=["SK_ID_PREV"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=("max",),
)

print("previous_application_agg shape:", previous_application_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
previous_application_agg = id_target.merge(previous_application_agg, how="left", on="SK_ID_CURR")

agg_features = previous_application_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, previous_application_agg["TARGET"])
corrs

In [None]:
# Numerical previous_application columns aggregated per applicant with `np.var`.
previous_application_agg = aggregate(
    previous_application.drop(columns=["SK_ID_PREV"]),
    by=["SK_ID_CURR"],
    dtype="num",
    num_stats=(np.var,),
)

print("previous_application_agg shape:", previous_application_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
previous_application_agg = id_target.merge(previous_application_agg, how="left", on="SK_ID_CURR")

agg_features = previous_application_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, previous_application_agg["TARGET"])
corrs

## Aggregate categorical columns

In [None]:
# Distinct-value count of every category / object / bool column.
cat_cols = previous_application.select_dtypes(["category", "object", "bool"]).columns
level_counts = previous_application[cat_cols].nunique()
for col, n_levels in level_counts.items():
    print(col, n_levels)

# Split into low-cardinality (<= 5 levels) and high-cardinality (> 5 levels) columns.
cat_cols_1 = [name for name in cat_cols if level_counts[name] <= 5]
cat_cols_2 = [name for name in cat_cols if level_counts[name] > 5]

print("cat_cols_1", cat_cols_1)
print("cat_cols_2", cat_cols_2)

In [None]:
# Low-cardinality categorical columns only (the > 5-level columns in
# cat_cols_2 are dropped), aggregated per applicant with `sum` and `mean`.
previous_application_agg = aggregate(
    previous_application.drop(columns=cat_cols_2),
    by=["SK_ID_CURR"],
    dtype="cat",
    cat_stats=("sum", "mean"),
)

print("previous_application_agg shape:", previous_application_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
previous_application_agg = id_target.merge(previous_application_agg, how="left", on="SK_ID_CURR")

agg_features = previous_application_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, previous_application_agg["TARGET"])
corrs

In [None]:
# Low-cardinality categorical columns only (the > 5-level columns in
# cat_cols_2 are dropped), aggregated per applicant with `nunique` and
# with one-hot encoding switched off.
previous_application_agg = aggregate(
    previous_application.drop(columns=cat_cols_2),
    by=["SK_ID_CURR"],
    dtype="cat",
    cat_stats=("nunique",),
    onehot_encode=False,
)

print("previous_application_agg shape:", previous_application_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
previous_application_agg = id_target.merge(previous_application_agg, how="left", on="SK_ID_CURR")

agg_features = previous_application_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, previous_application_agg["TARGET"])
corrs

In [None]:
# High-cardinality categorical columns only: the <= 5-level columns in
# cat_cols_1 are dropped, so the columns with more than 5 unique values
# remain. Aggregated per applicant with `sum` and `mean`.
previous_application_agg = aggregate(
    previous_application.drop(columns=cat_cols_1),
    by=["SK_ID_CURR"],
    dtype="cat",
    cat_stats=("sum", "mean"),
)

print("previous_application_agg shape:", previous_application_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
previous_application_agg = id_target.merge(previous_application_agg, how="left", on="SK_ID_CURR")

agg_features = previous_application_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, previous_application_agg["TARGET"])
corrs

In [None]:
# High-cardinality categorical columns only: the <= 5-level columns in
# cat_cols_1 are dropped, so the columns with more than 5 unique values
# remain. Aggregated per applicant with `nunique`, one-hot encoding off.
previous_application_agg = aggregate(
    previous_application.drop(columns=cat_cols_1),
    by=["SK_ID_CURR"],
    dtype="cat",
    cat_stats=("nunique",),
    onehot_encode=False,
)

print("previous_application_agg shape:", previous_application_agg.shape)

# Left-join onto the training ids so every aggregate sits next to TARGET.
id_target = application_train[["SK_ID_CURR", "TARGET"]]
previous_application_agg = id_target.merge(previous_application_agg, how="left", on="SK_ID_CURR")

agg_features = previous_application_agg.drop(columns=["SK_ID_CURR", "TARGET"])
corrs = corrwith(agg_features, previous_application_agg["TARGET"])
corrs

We will similarly aggregate `POS_CASH_balance`, `credit_card_balance` and `installments_payments` tables.