In [1]:
import os
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import scipy.cluster.hierarchy as spc

# Use the "plotly_white" template for every plotly figure created in this notebook.
pio.templates.default = "plotly_white"

In [2]:
# One-time setup, guarded by the `first_run` flag so that re-running this cell
# does not climb yet another directory level.
try:
    # Bare name lookup: raises NameError on a fresh kernel. Deliberately not
    # assigned to `_`, which IPython reserves for the last cell output.
    first_run
except NameError:
    first_run = True
    # Move the working directory to its parent so that the project-local
    # `_aux` package can be imported. `os.path.dirname` is separator-agnostic
    # and, unlike splitting on "/", never yields an empty path at the root.
    os.chdir(os.path.dirname(os.getcwd()))
    from _aux import functions as func

# Load data

In [3]:
# Columns read from the feature file: the row identifier (used as index) plus
# the "status" and "account days" variables handled in this notebook.
STATUS_COLUMNS = [
    "row_id",
    "account_status",
    "account_worst_status_0_3m",
    "account_worst_status_12_24m",
    "account_worst_status_3_6m",
    "account_worst_status_6_12m",
    "status_2nd_last_archived_0_24m",
    "status_3rd_last_archived_0_24m",
    "status_last_archived_0_24m",
    "status_max_archived_0_12_months",
    "status_max_archived_0_24_months",
    "status_max_archived_0_6_months",
    "account_days_in_dc_12_24m",
    "account_days_in_rem_12_24m",
    "account_days_in_term_12_24m",
]

# Read features and labels once, join them on the shared index, then split by
# label. Previously both CSV files were parsed twice, once per subset.
_status_train = pd.read_csv(
    "../data/train/X_train.csv",
    index_col=0,
    usecols=STATUS_COLUMNS,
).join(pd.read_csv("../data/train/y_train.csv", index_col=0))

# Rows labelled as default / not default, respectively.
default = _status_train.query("default == 1")
not_default = _status_train.query("default == 0")

## Correlation between status variables

As shown in our sanity profile report, status variables are highly correlated amongst themselves, mainly due to overlapping lookback windows for aggregation. The first thing we must do is choose one (or some) of them to represent the group, which can be achieved by correlation clustering.

In [4]:
# Pairwise correlation over the whole training sample (both label subsets
# stacked back together).
corr = pd.concat([default, not_default]).corr()

# Boolean mask covering the upper triangle, diagonal included: the matrix is
# symmetric, so only the lower triangle is drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig = go.Figure(
    data=go.Heatmap(
        x=corr.columns,
        y=corr.columns,
        z=corr.mask(mask),
        zmin=-1,
        zmax=1,
        colorscale=px.colors.diverging.RdBu,
    )
)

fig.update_layout(
    title="Correlation between 'status' variables",
    xaxis=dict(showgrid=False),
    # Reverse the y axis so the first variable sits at the top of the plot.
    yaxis=dict(autorange="reversed", showgrid=False),
    width=1000,
    height=500,
)

fig.show()

In [12]:
# Cluster variables by the similarity of their correlation profiles: each row
# of `corr` is treated as an observation, pairwise distances between rows are
# linked with single linkage, and the dendrogram is cut at half of the largest
# pairwise distance.
# NOTE(review): `spc.distance` appears to be `scipy.spatial.distance` as exposed
# by `scipy.cluster.hierarchy` - confirm, or import it explicitly.
pdist = spc.distance.pdist(corr)
linkage = spc.linkage(pdist, method="single")
idx = spc.fcluster(linkage, 0.5 * pdist.max(), "distance")

# `idx` is positionally aligned with the rows/columns of `corr`, so the labels
# must come from `corr.columns` (not from `default.columns`, which would be
# misaligned if `.corr()` ever dropped or reordered a column).
columns = corr.columns[np.argsort(idx)].tolist()
clusterd_corr = pd.concat([default, not_default]).reindex(columns, axis=1).corr()

# Upper-triangle mask (diagonal included) for the cluster-ordered matrix.
mask = np.zeros_like(clusterd_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Correlation of each variable with the label, tagged with its cluster and
# ranked within each cluster from most to least positively correlated.
corr["default"].to_frame(name="corr_with_label").assign(cluster=idx).drop(
    "default"
).sort_values(["cluster", "corr_with_label"], ascending=[True, False])

Unnamed: 0,corr_with_label,cluster
status_last_archived_0_24m,0.043865,1
status_max_archived_0_24_months,0.025782,1
status_max_archived_0_12_months,0.005628,1
status_2nd_last_archived_0_24m,0.003872,1
status_3rd_last_archived_0_24m,-0.016915,1
status_max_archived_0_6_months,-0.028631,1
account_worst_status_6_12m,0.164674,2
account_worst_status_0_3m,0.15896,2
account_worst_status_12_24m,0.157874,2
account_worst_status_3_6m,0.157014,2


As we can see, our correlation clustering strategy yields 2 clusters of variables. The variables from each cluster that best correlate with the target label are "status_last_archived_0_24m" and "account_worst_status_6_12m". By and large, the first cluster has very little correlation with the target label, so we will keep its variable whilst keeping a close watch on whether it decreases inference performance.

> Note that any correlation with the target label is very loosely accurate as the label is boolean, not numeric. Despite there being better ways, we consider this a "good enough" approach to choose the representative of each cluster.

There is one thing we can do that should improve even further how much the second cluster explains the target, which is to aggregate its variables into one. In fact, we will create one that spans 12 months and another that spans all 24 months and check how they correlate to the target.

In [6]:
def _worst_status(frame, windows, fill_value=1):
    """Row-wise worst (maximum) status across the ``windows`` columns.

    Rows with no value in any of the windows are imputed with ``fill_value``
    (1 by default).
    """
    return frame[windows].max(axis=1).fillna(fill_value)


# Lookback windows that make up each aggregated variable.
_WORST_STATUS_12M = [
    "account_worst_status_0_3m",
    "account_worst_status_3_6m",
    "account_worst_status_6_12m",
]
_WORST_STATUS_24M = _WORST_STATUS_12M + ["account_worst_status_12_24m"]

# Correlation of the two aggregated variables with the target label.
(
    pd.concat([default, not_default])
    .assign(
        account_worst_status_0_12m=lambda df: _worst_status(df, _WORST_STATUS_12M),
        account_worst_status_0_24m=lambda df: _worst_status(df, _WORST_STATUS_24M),
    )[["account_worst_status_0_12m", "account_worst_status_0_24m", "default"]]
    .corr()["default"]
    .to_frame("corr_with_target")
    .drop("default")
)

Unnamed: 0,corr_with_target
account_worst_status_0_12m,0.142014
account_worst_status_0_24m,0.139908


On top of aggregating the features of the second cluster, we decided to impute missing values with 1 (the best status), which is in line with the principle of customer obsession, as we choose to give the customer the benefit of the doubt. This does slightly reduce the correlation coefficient, but since these columns have over 54% missing data, the fact that imputing reduces it by less than 0.03 suggests that it was a good decision. We will consider only the previous 12 months, as it was the better of the two.

Next, we take advantage of K Proportions Theory once again to test the null hypothesis that "the probability with which defaults happen is the same across all categories" against the alternate hypothesis that "the probability with which defaults happen differs across some categories".

### 1. Test the null hypothesis for "status_last_archived_0_24m"

In [10]:
# NOTE: "status_last_archived_0_24m" has no missing values, so no imputation
# is needed before grouping.

# Per-category contingency table: number of defaults, non-defaults and total
# observations for each value of the last archived status.
arch_status_cols = ["status_last_archived_0_24m", "default"]

arch_status_profile = (
    pd.concat([default[arch_status_cols], not_default[arch_status_cols]])
    .groupby("status_last_archived_0_24m")
    .agg(
        default=("default", "sum"),
        # presumably the number of rows with default == 0; see `_aux.functions`
        not_default=("default", func.complement),
        count=("default", "count"),
    )
    .astype(int)
)

# Test of equal default proportions across categories; only the second
# returned element (the per-category table displayed below) is kept.
_, arch_status_profile_chi2 = func.test_k_prop(arch_status_profile)

arch_status_profile_chi2

Using 3 degrees of freedom
Reject null hypothesis with 2207.2045706697118 > 7.814727903251178


Unnamed: 0_level_0,default,not_default,count,expected_default,expected_not_default,chi_default,chi_not_default
status_last_archived_0_24m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,383,13047,13430,192.176994,13237.823006,189.478559,2.750711
1,311,51495,51806,741.319533,51064.680467,249.790937,3.626281
2,156,5461,5617,80.376632,5536.623368,71.151199,1.032921
3,180,947,1127,16.126841,1110.873159,1665.199816,24.174148


These results follow the common sense that the probability of default varies according to the status of one's previous purchase. We believe that a shorter span, perhaps 6 months, would make for a better predictor for the target. However, this is just an intuition that needs testing.

We choose to include this variable in our effort to select the best features for our models.

---

### 2. Test the null hypothesis for "account_worst_status_0_12m"

In [9]:
def agg_worst_status(df, fill_value=1):
    """Return a copy of ``df`` with an ``account_worst_status_0_12m`` column.

    The new column is the row-wise maximum of the three worst-status windows
    covering the previous 12 months (0-3m, 3-6m and 6-12m).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``account_worst_status_0_3m``,
        ``account_worst_status_3_6m`` and ``account_worst_status_6_12m``.
    fill_value : scalar, default 1
        Value imputed when all three windows are missing. The default of 1 is
        the best status, giving the customer the benefit of the doubt.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    windows = [
        "account_worst_status_0_3m",
        "account_worst_status_3_6m",
        "account_worst_status_6_12m",
    ]
    # `max` skips NaN, so the result is NaN only when every window is missing.
    return df.assign(
        account_worst_status_0_12m=df[windows].max(axis=1).fillna(fill_value)
    )

# Per-category contingency table: number of defaults, non-defaults and total
# observations for each value of the aggregated 12-month worst status.
worst_status_cols = ["account_worst_status_0_12m", "default"]

worst_status_profile = (
    pd.concat(
        [
            agg_worst_status(subset)[worst_status_cols]
            for subset in (default, not_default)
        ]
    )
    .groupby("account_worst_status_0_12m")
    .agg(
        default=("default", "sum"),
        # presumably the number of rows with default == 0; see `_aux.functions`
        not_default=("default", func.complement),
        count=("default", "count"),
    )
    .astype(int)
)

# Test of equal default proportions across categories; only the second
# returned element (the per-category table displayed below) is kept.
_, worst_status_profile_chi2 = func.test_k_prop(worst_status_profile)

worst_status_profile_chi2

Using 3 degrees of freedom
Reject null hypothesis with 1848.8880186293527 > 7.814727903251178


Unnamed: 0_level_0,default,not_default,count,expected_default,expected_not_default,chi_default,chi_not_default
account_worst_status_0_12m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.0,578,61597,62175,889.695054,61285.304946,109.198996,1.585271
2.0,289,8305,8594,122.976104,8471.023896,224.140568,3.253908
3.0,103,864,967,13.837316,953.162684,574.532249,8.340637
4.0,60,184,244,3.491525,240.508475,914.559487,13.276903


Results show that the probability with which defaults happen differs across categories in the "worst status" variable. This is rather intuitive, as a prior case of default could be explained by some personality traits, economic status and other exogenous variables. However, money shortage due to no fault of one's own also plays a considerable part, as has sadly become more evident with the Covid-19 pandemic. All in all, we choose to employ the variable to induce our models but warn of the risk of irresponsible social profiling that could reinforce the already existing overall prejudice and racism against minorities and the poor.

---

To sum up, we show that "status_last_archived_0_24m" and "account_worst_status_0_12m" are good candidates for features. Hence, we move forward with 2 candidates for features from "status" variables:

- status_last_archived_0_24m
- account_worst_status_0_12m

Next, we look at "account" variables.