In [1]:
import os
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import scipy.cluster.hierarchy as spc

# Set plotly's default figure template to "plotly_white".
pio.templates.default = "plotly_white"

In [2]:
# One-time setup guarded by the `first_run` flag: the NameError branch only
# runs while `first_run` is still undefined, so re-executing this cell does not
# climb one more directory each time.
try:
    _ = first_run
except NameError:
    first_run = True
    # Move to the parent of the current working directory. os.path.dirname
    # honours the platform's path separator, whereas splitting on a literal
    # "/" leaves a Windows-style path unchanged.
    os.chdir(os.path.dirname(os.getcwd()))
    # Project-local helpers; presumably importable only from the parent
    # directory, hence the chdir above -- TODO confirm.
    from _aux import functions as func

# Load data

In [3]:
# Columns of X_train.csv kept for the correlation study below. With
# index_col=0 the first of these columns (in file order) becomes the index.
SELECTED_COLUMNS = [
    "row_id",
    "account_amount_added_12_24m",
    "sum_capital_paid_account_0_12m",
    "sum_capital_paid_account_12_24m",
    "account_incoming_debt_vs_paid_0_24m",
    "num_unpaid_bills",
    "sum_paid_inv_0_12m",
    "max_paid_inv_0_12m",
    "max_paid_inv_0_24m",
    "num_active_inv",
    "num_active_div_by_paid_inv_0_12m",
]

# Labels are read once and joined (on the index) onto each feature frame.
labels = pd.read_csv("../data/train/y_train.csv", index_col=0)

# Selected features plus the "default" label, parsed a single time and then
# split by class instead of re-reading both CSV files for every subset.
selected_features = pd.read_csv(
    "../data/train/X_train.csv",
    index_col=0,
    usecols=SELECTED_COLUMNS,
).join(labels)

default = selected_features.query("default == 1")
not_default = selected_features.query("default == 0")

# Full training set (every feature column) with the label attached.
df = pd.read_csv(
    "../data/train/X_train.csv",
    index_col=0,
).join(labels)

## Correlation between "account" variables

As shown in our sanity profile report, account variables are highly correlated with one another, mainly due to overlapping lookback windows for aggregation. The first thing we must do is choose one (or some) of them to represent the group, which can be achieved by correlation clustering.

In [4]:
# Pairwise correlations over the recombined default / not_default frames
# (the label column is part of both, so it appears in the matrix as well).
corr = pd.concat([default, not_default]).corr()

# Boolean mask covering the diagonal and the upper triangle, so that each
# pair of variables is drawn only once in the heatmap.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool))

heatmap = go.Heatmap(
    z=corr.mask(upper_triangle),
    x=corr.columns,
    y=corr.columns,
    colorscale=px.colors.diverging.RdBu,
    zmin=-1,
    zmax=1,
)

fig = go.Figure(data=[heatmap])
fig.update_layout(
    title="Correlation between 'account' variables",
    xaxis=dict(showgrid=False),
    yaxis=dict(autorange="reversed", showgrid=False),
    width=1000,
    height=500,
)
fig.show()

In [5]:
# Single-linkage hierarchical clustering on the pairwise distances between the
# rows of the correlation matrix; the tree is cut at half the largest distance.
pdist = spc.distance.pdist(corr)
linkage = spc.linkage(pdist, method="single")
idx = spc.fcluster(linkage, t=0.5 * pdist.max(), criterion="distance")

# Column names reordered so that members of the same cluster sit together.
columns = default.columns[np.argsort(idx)].tolist()
clusterd_corr = pd.concat([default, not_default])[columns].corr()

# Diagonal + upper-triangle mask for the reordered correlation matrix.
mask = np.triu(np.ones(clusterd_corr.shape, dtype=bool))

# Correlation of every variable with the label next to its cluster id,
# strongest positive correlation first within each cluster.
(
    corr["default"]
    .rename("corr_with_label")
    .to_frame()
    .assign(cluster=idx)
    .drop(index="default")
    .sort_values(by=["cluster", "corr_with_label"], ascending=[True, False])
)

Unnamed: 0,corr_with_label,cluster
account_amount_added_12_24m,0.006217,1
sum_capital_paid_account_12_24m,0.005424,1
sum_capital_paid_account_0_12m,0.002648,1
num_unpaid_bills,0.028196,2
num_active_inv,0.01878,2
sum_paid_inv_0_12m,-0.034797,2
max_paid_inv_0_12m,-0.041735,2
max_paid_inv_0_24m,-0.042666,2
num_active_div_by_paid_inv_0_12m,0.137125,3
account_incoming_debt_vs_paid_0_24m,-0.000125,5


Although we know that correlation does not imply causation, the lack of proper time for exploration leads us to the not-so-ideal path of using correlation to exclude variables. For now, we choose to analyze only the variable "num_active_div_by_paid_inv_0_12m", at the cost of probably losing some good insights.

## 1. "num_active_div_by_paid_inv_0_12m"

This variable has 22.8% of missing values and 48.7% of zero values, on top of some hard-hitting outliers.
One behaviour this variable describes is whether the customer has had a good experience with us and has come back for more. On the flip side, users that have not finished paying their first invoice yet (in a 12 month span) are assigned NaN simply because division by zero is undefined. Hence, a good customer will get NaN on their first purchase, but so will an ill-intentioned customer who places multiple orders with no intention of paying for them. Therefore, we choose to fill missing values with the number of active invoices so that the latter can become visible through this variable.

Next, we look at how defaults are distributed across the categories.


In [6]:
# Quantiles to report: 25% up to 95% in 10-point steps, plus the 99th percentile.
percentiles = np.append(np.arange(0.25, 1.0, 0.1), 0.99)

# Summary statistics of the ratio after filling its missing values with the
# number of active invoices.
(
    df["num_active_div_by_paid_inv_0_12m"]
    .combine_first(df["num_active_inv"])
    .to_frame(name="num_active_div_by_paid_inv_0_12m")
    .describe(percentiles=percentiles)
)

Unnamed: 0,num_active_div_by_paid_inv_0_12m
count,71980.0
mean,0.112705
std,0.310636
min,0.0
25%,0.0
35%,0.0
45.0%,0.0
50%,0.0
55.0%,0.0
65.0%,0.0


In [7]:
# Default / not-default counts per unit-wide bin of the imputed ratio
# (missing ratios are replaced by the number of active invoices). Values that
# fall outside (-1, 9] get no bin from pd.cut and drop out of the groupby.
binned_counts = (
    df[["default"]]
    .assign(
        var=df["num_active_div_by_paid_inv_0_12m"].combine_first(df["num_active_inv"]),
        bins_var=lambda frame: pd.cut(frame["var"], range(-1, 10)),
    )
    # observed=False states the long-standing default explicitly: every bin is
    # kept, even an empty one, and recent pandas versions stop warning about
    # the implicit value.
    .groupby("bins_var", observed=False)["default"]
    .agg(["sum", "count"])
    .assign(
        default=lambda counts: counts["sum"],
        not_default=lambda counts: counts["count"] - counts["sum"],
    )[["default", "not_default", "count"]]
    .astype(int)
)

# Project helper; judging by its printed output it runs a chi-square style
# test across the bins and returns the table extended with expected counts
# -- TODO confirm against _aux.functions.
_, var_profile = func.test_k_prop(binned_counts)

var_profile

Using 9 degrees of freedom
Reject null hypothesis with 1000.092329327441 > 16.91897760462045


Unnamed: 0_level_0,default,not_default,count,expected_default,expected_not_default,chi_default,chi_not_default
bins_var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(-1, 0]",620,49350,49970,714.362939,49255.637061,12.464762,0.180779
"(0, 1]",329,21040,21369,305.487726,21063.512274,1.809654,0.026246
"(1, 2]",50,475,525,7.505314,517.494686,240.602635,3.489501
"(2, 3]",14,66,80,1.143667,78.856333,144.522238,2.096031
"(3, 4]",6,12,18,0.257325,17.742675,128.1582,1.8587
"(4, 5]",5,5,10,0.142958,9.857042,165.019052,2.3933
"(5, 6]",1,2,3,0.042888,2.957112,21.3597,0.309783
"(6, 7]",1,0,1,0.014296,0.985704,67.964733,0.985704
"(7, 8]",1,0,1,0.014296,0.985704,67.964733,0.985704
"(8, 9]",2,0,2,0.028592,1.971408,135.929466,1.971408


In [8]:
# One x position per bin, in bin order.
bin_positions = list(range(len(var_profile)))

fig = go.Figure()

# Share of each outcome inside every bin; "not_default" is added first so the
# stacking order of the bars is unchanged.
for outcome in ("not_default", "default"):
    fig.add_trace(
        go.Bar(
            x=bin_positions,
            y=var_profile[outcome] / var_profile["count"],
            name=outcome,
        )
    )

fig.update_layout(
    title="Distribution of observations across 'num_active_div_by_paid_inv_0_12m' classes",
    barmode="relative",
    yaxis_title="Percentage",
    yaxis_tickformat="%",
    xaxis_title="Ordered Bins",
    xaxis_tickvals=bin_positions,
)

fig.update_traces(opacity=0.75)
fig.show()

Let's take a look at the impact of our imputation by printing the number of new cases added to each category.

In [9]:
# Per-bin counts on the raw (non-imputed) ratio, subtracted from the imputed
# profile in `var_profile`: the difference is the number of observations that
# imputation added to each bin. Rows whose raw ratio is missing get no bin and
# are therefore absent from the raw counts.
(
    df[["num_active_div_by_paid_inv_0_12m", "default"]]
    .assign(bins_var=pd.cut(df["num_active_div_by_paid_inv_0_12m"], range(-1, 10)))
    # observed=False states the long-standing default explicitly so that all
    # bins are kept and the rows line up with `var_profile`.
    .groupby("bins_var", observed=False)["default"]
    .agg(["sum", "count"])
    .assign(
        default=lambda counts: counts["sum"],
        not_default=lambda counts: counts["count"] - counts["sum"],
    )[["default", "not_default", "count"]]
    .astype(int)
    .assign(
        default=lambda raw: var_profile["default"] - raw["default"],
        not_default=lambda raw: var_profile["not_default"] - raw["not_default"],
        count=lambda raw: var_profile["count"] - raw["count"],
    )
)

Unnamed: 0_level_0,default,not_default,count
bins_var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(-1, 0]",337,14562,14899
"(0, 1]",84,1291,1375
"(1, 2]",21,113,134
"(2, 3]",2,19,21
"(3, 4]",3,4,7
"(4, 5]",0,1,1
"(5, 6]",0,0,0
"(6, 7]",0,0,0
"(7, 8]",0,0,0
"(8, 9]",0,0,0


We see that most new observations due to imputation fall in the first bin (equal to zero). However, percentage-wise, the increase in default cases is significantly higher in the categories that hold fewer observations, which indicates that our imputation strategy was indeed a good idea.

Now, let's look at the impact of transforming this variable into a binary one.

In [10]:
# Same profile as for the unit-wide bins, but with the imputed ratio collapsed
# into two classes: (-1, 1] and (1, inf].
binary_counts = (
    df[["default"]]
    .assign(
        var=df["num_active_div_by_paid_inv_0_12m"].combine_first(df["num_active_inv"]),
        bins_var=lambda frame: pd.cut(frame["var"], [-1, 1, np.inf]),
    )
    # observed=False states the long-standing default explicitly: both classes
    # are always present in the result and recent pandas versions stop warning
    # about the implicit value.
    .groupby("bins_var", observed=False)["default"]
    .agg(["sum", "count"])
    .assign(
        default=lambda counts: counts["sum"],
        not_default=lambda counts: counts["count"] - counts["sum"],
    )[["default", "not_default", "count"]]
    .astype(int)
)

# Project helper; judging by its printed output it runs a chi-square style
# test across the classes and returns the table extended with expected counts
# -- TODO confirm against _aux.functions.
_, bool_var_profile = func.test_k_prop(binary_counts)

bool_var_profile

Using 1 degrees of freedom
Reject null hypothesis with 575.7626090477304 > 3.8414588206941285


Unnamed: 0_level_0,default,not_default,count,expected_default,expected_not_default,chi_default,chi_not_default
bins_var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"(-1.0, 1.0]",949,70390,71339,1020.827591,70318.172409,5.053941,0.073369
"(1.0, inf]",81,560,641,9.172409,631.827591,562.469775,8.165523


It is interesting that the contamination rate in the "below 1" class is very close to that of the dataset as a whole (~0.014), while the "above 1" class shows a nearly 10-fold increase in contamination (~0.13). It seems that transforming this variable into a boolean will still capture some of the "default" behaviour whilst reducing the trouble with the Curse of Dimensionality.

How does this new variable correlate to the target label?

In [11]:
# Ratio with its missing values replaced by the number of active invoices.
imputed_ratio = df["num_active_div_by_paid_inv_0_12m"].combine_first(
    df["num_active_inv"]
)

# Correlation with the label for the imputed ratio ("var") and for its
# "greater than 1" indicator ("above_1").
(
    df[["default"]]
    .assign(var=imputed_ratio, above_1=(imputed_ratio > 1).astype(float))
    .corr()["default"]
    .rename("corr_with_label")
    .to_frame()
    .drop(index="default")
)

Unnamed: 0,corr_with_label
var,0.125188
above_1,0.089437


It is to be expected that reducing granularity will also reduce power. However, we must remind ourselves that correlation is not causation, and that the trade-off should be worth it. In conclusion, we decide to move forward with the boolean (above 1?) equivalent of the "num_active_div_by_paid_inv_0_12m" variable.

Next, we look at "archived" variables.