In [115]:
import os
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.cluster.hierarchy as spc

pio.templates.default = "plotly_white"

In [116]:
# One-time notebook bootstrap: move the working directory one level up and
# import the project helpers. The `first_run` flag makes the cell safe to
# re-run, so the kernel does not climb one directory further on every execution.
if "first_run" not in globals():
    # Set the flag before chdir so a failure further down cannot trigger a
    # second directory change on re-run.
    first_run = True
    # os.path.dirname is separator-agnostic and returns the root itself (not an
    # empty string) when the current directory sits directly under the root.
    os.chdir(os.path.dirname(os.getcwd()))
    # Imported only after the chdir; presumably `_aux` is resolved relative to
    # the new working directory — TODO confirm.
    from _aux import functions as func

# Load data

In [117]:
# Labels are shared by every frame below, so parse the file once and reuse it.
labels = pd.read_csv("../data/train/y_train.csv", index_col=0)

# Merchant-only view of the features, joined with the labels on the index.
merchant_labelled = pd.read_csv(
    "../data/train/X_train.csv",
    index_col=0,
    usecols=[
        "row_id",
        "merchant_category",
        "merchant_group",
    ],
).join(labels)

# Rows whose label is 1 (defaulted) and 0 (did not default), respectively.
default = merchant_labelled.query("default == 1")
not_default = merchant_labelled.query("default == 0")

# Full feature set joined with the labels; used by the rest of the notebook.
df = pd.read_csv(
    "../data/train/X_train.csv",
    index_col=0,
).join(labels)

## Overview

"merchant_category" and "merchant_group" are highly correlated with each other. We want to find out whether they can help us predict defaults, and the first step is to see how the two variables relate:

In [163]:
def highlight_above_mean(s):
    '''
    Build per-element CSS styles that mark values above the Series mean.

    Parameters
    ----------
    s : pandas.Series
        Values to style (one column when used with ``Styler.apply``).

    Returns
    -------
    list of str
        ``'background-color: yellow'`` for every element strictly greater
        than ``s.mean()`` and an empty string for every other element.
    '''
    threshold = s.mean()
    styles = []
    for value in s:
        # Comparisons against NaN are False, so missing values stay unstyled.
        styles.append('background-color: yellow' if value > threshold else '')
    return styles


def _contamination(flags):
    """Share of defaulted rows: sum of the 0/1 flags over the number of rows."""
    return flags.sum() / flags.shape[0]


# Per (group, category) pair: distinct customers, default / non-default counts
# and the contamination rate of the category.
category_stats = df.groupby(["merchant_group", "merchant_category"]).agg(
    customers=("uuid", "nunique"),
    default=("default", "sum"),
    not_default=("default", func.complement),
    cat_contamination=("default", _contamination),
)

# Contamination rate of each merchant group as a whole.
group_stats = df.groupby("merchant_group").agg(
    group_contamination=("default", _contamination)
)

# Attach the group-level rate to every category row and highlight the
# contamination values that lie above their column mean.
(
    category_stats.join(group_stats, on="merchant_group")
    .style.apply(
        highlight_above_mean,
        subset=["cat_contamination", "group_contamination"],
    )
    .format(
        {
            "default": "{:.0f}",
            "not_default": "{:.0f}",
            "cat_contamination": "{:.2%}",
            "group_contamination": "{:.2%}",
        }
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,customers,default,not_default,cat_contamination,group_contamination
merchant_group,merchant_category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Automotive Products,Automotive Parts & Accessories,594,7,587,1.18%,1.63%
Automotive Products,Wheels & Tires,82,4,78,4.88%,1.63%
Children Products,Children Clothes & Nurturing products,1082,15,1067,1.39%,1.24%
Children Products,Children toys,460,3,457,0.65%,1.24%
Children Products,Diversified children products,2171,28,2143,1.29%,1.24%
Clothing & Shoes,Adult Shoes & Clothing,224,5,219,2.23%,2.31%
Clothing & Shoes,General Shoes & Clothing,3359,56,3303,1.67%,2.31%
Clothing & Shoes,Underwear,61,2,59,3.28%,2.31%
Clothing & Shoes,Youthful Shoes & Clothing,8335,214,8121,2.57%,2.31%
Electronics,Car electronics,33,2,31,6.06%,2.38%


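In the table above, we highlight the categories and groups whose contamination rate (the share of customers that default) is above the mean of the respective column.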

## 1. "merchant_group"

In [119]:
# Share of defaults / non-defaults and number of observations per merchant
# group, ordered from the most to the least populated group.
plot_data = (
    df.groupby("merchant_group")
    .agg(
        default=("default", lambda col: col.sum() / len(col)),
        not_default=("default", lambda col: (len(col) - col.sum()) / len(col)),
        counts=("default", "count"),
    )
    .sort_values("counts", ascending=False)
)

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Stacked, semi-transparent bars on the primary axis: one trace per outcome.
for outcome in ("not_default", "default"):
    fig.add_trace(
        go.Bar(
            x=plot_data.index,
            y=plot_data[outcome],
            name=outcome,
            opacity=0.4,
        ),
        secondary_y=False,
    )

# Observation counts as a fully opaque line on the secondary axis.
fig.add_trace(
    go.Scatter(
        x=plot_data.index,
        y=plot_data["counts"],
        name="observations",
        mode='lines+markers',
        line={"color": "indianred", "width": 4},
    ),
    secondary_y=True,
)

fig.update_layout(
    title="Merchant Group",
    barmode="stack",
    yaxis={"tickformat": "%", "title": ""},
    width=1000,
    height=500,
)

# Primary axis: no grid lines.
fig.update_yaxes(showgrid=False, secondary_y=False)

# Secondary axis: log scale; on a log axis the range is given in log10 units.
fig.update_yaxes(
    title_text="<b>Observations (log)</b>",
    type="log",
    range=[-1e-3, 4.8],
    secondary_y=True,
)

fig.show()

In [126]:
# Parallel-categories diagram linking each merchant group to its categories.
# The cardinality cap is raised to 100 so neither dimension is dropped.
merchant_pairs = df[["merchant_group", "merchant_category"]]
fig = px.parallel_categories(
    merchant_pairs,
    dimensions_max_cardinality=100,
)

fig.show()

## 2. "merchant_category"

This variable has 56 distinct categories, of which "Diversified entertainment" is by far the most common. We give an overview of the variable next:

In [118]:
# Share of defaults / non-defaults and number of observations per merchant
# category, ordered from the most to the least populated category.
plot_data = (
    df.groupby("merchant_category")
    .agg(
        default=("default", lambda col: col.sum() / len(col)),
        not_default=("default", lambda col: (len(col) - col.sum()) / len(col)),
        counts=("default", "count"),
    )
    .sort_values("counts", ascending=False)
)

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Stacked, semi-transparent bars on the primary axis: one trace per outcome.
for outcome in ("not_default", "default"):
    fig.add_trace(
        go.Bar(
            x=plot_data.index,
            y=plot_data[outcome],
            name=outcome,
            opacity=0.4,
        ),
        secondary_y=False,
    )

# Observation counts as a fully opaque line on the secondary axis.
fig.add_trace(
    go.Scatter(
        x=plot_data.index,
        y=plot_data["counts"],
        name="observations",
        mode='lines+markers',
        line={"color": "indianred", "width": 4},
    ),
    secondary_y=True,
)

fig.update_layout(
    title="Merchant Category",
    barmode="stack",
    yaxis={"tickformat": "%", "title": ""},
    width=1500,
    height=600,
)

# Primary axis: no grid lines.
fig.update_yaxes(showgrid=False, secondary_y=False)

# Secondary axis: log scale; on a log axis the range is given in log10 units.
fig.update_yaxes(
    title_text="<b>Observations (log)</b>",
    type="log",
    range=[-1e-3, 4.5],
    secondary_y=True,
)

fig.show()

The plot above conveys two things: the left y-axis shows how defaults are distributed within each category, and the right y-axis (log scale) shows how many observations each category has. A few broad observations:
- Diversified entertainment holds ~39% of all customers in the sample but sees only half the rate of defaults compared to the whole sample
- 11 categories have at least 1K customers
- 20 categories have less than 100 customers
- "Sex toys" and "Plants & Flowers" see more than 10% of their customers default, and "Tobacco" almost 15%
    - these merchants combined have less than 0.2% of our customers
- In the mid-range, "Dating services" see almost 10% of customers default

In [None]:
# Fill gaps in the ratio with "num_active_inv", then bucket the result into
# the unit-wide intervals (-1, 0], (0, 1], ..., (8, 9].
imputed_ratio = df["num_active_div_by_paid_inv_0_12m"].combine_first(
    df["num_active_inv"]
)

# Default / non-default / total counts per bin, cast to integers.
binned_counts = (
    df[["default"]]
    .assign(var=imputed_ratio, bins_var=pd.cut(imputed_ratio, range(-1, 10)))
    .groupby("bins_var")
    .agg(
        default=("default", "sum"),
        not_default=("default", func.complement),
        count=("default", "count"),
    )
    .astype(int)
)

# Only the second value returned by the helper (the per-bin profile) is kept.
_, var_profile = func.test_k_prop(binned_counts)

var_profile

In [None]:
# Bars are placed at the ordinal position of each bin (0, 1, 2, ...).
bin_positions = list(range(len(var_profile)))

fig = go.Figure()

# One bar trace per outcome, expressed as a share of the bin's observations.
for outcome in ("not_default", "default"):
    fig.add_trace(
        go.Bar(
            x=bin_positions,
            y=var_profile[outcome] / var_profile["count"],
            name=outcome,
            opacity=0.75,
        )
    )

fig.update_layout(
    title="Distribution of observations across 'num_active_div_by_paid_inv_0_12m' classes",
    barmode="relative",
    xaxis_title="Ordered Bins",
    xaxis_tickvals=bin_positions,
    yaxis_title="Percentage",
    yaxis_tickformat="%",
)

fig.show()

Let's look at the impact of our imputation by printing, for each bin, the number of new cases it brings into consideration.

In [None]:
# Same per-bin counts as `var_profile`, but built from the raw column
# (no imputation from "num_active_inv").
raw_profile = (
    df[["num_active_div_by_paid_inv_0_12m", "default"]]
    .assign(
        bins_var=pd.cut(df["num_active_div_by_paid_inv_0_12m"], range(-1, 10))
    )
    .groupby("bins_var")
    .agg(
        default=("default", "sum"),
        not_default=("default", func.complement),
        count=("default", "count"),
    )
    .astype(int)
)

# Per bin: imputed profile minus raw profile, i.e. the rows gained by imputing.
raw_profile.assign(
    **{
        column: (lambda frame, column=column: var_profile[column] - frame[column])
        for column in ("default", "not_default", "count")
    }
)

We see that most new observations due to imputation fall in the first bin (equal to zero). However, in percentage terms, the increase in default cases is significantly higher in the bins that hold fewer observations, which indicates that our imputation strategy was indeed a good idea.

Now, let's look at the impact of transforming this variable into a binary one.

In [None]:
# Fill gaps in the ratio with "num_active_inv", then split the result into
# two classes: (-1, 1] and (1, inf].
imputed_ratio = df["num_active_div_by_paid_inv_0_12m"].combine_first(
    df["num_active_inv"]
)

# Default / non-default / total counts per class, cast to integers.
binary_counts = (
    df[["default"]]
    .assign(var=imputed_ratio, bins_var=pd.cut(imputed_ratio, [-1, 1, np.inf]))
    .groupby("bins_var")
    .agg(
        default=("default", "sum"),
        not_default=("default", func.complement),
        count=("default", "count"),
    )
    .astype(int)
)

# Only the second value returned by the helper (the per-class profile) is kept.
_, bool_var_profile = func.test_k_prop(binary_counts)

bool_var_profile

It is interesting that the contamination rate in the "below 1" class is very close to that of the dataset as a whole (~0.014), whereas the "above 1" class sees a nearly 10-fold increase in its contamination (~0.13). It seems that transforming this variable into a boolean will still capture some of the "default" behaviour whilst reducing the trouble with the Curse of Dimensionality.

How does this new variable correlate to the target label?

In [None]:
# Correlation of the imputed ratio and of its "above 1" indicator with the label.
imputed_ratio = df["num_active_div_by_paid_inv_0_12m"].combine_first(
    df["num_active_inv"]
)

candidates = df[["default"]].assign(
    var=imputed_ratio,
    above_1=(imputed_ratio > 1).astype(float),
)

# Keep only the "default" column of the matrix and drop its self-correlation row.
(
    candidates.corr()["default"]
    .to_frame(name="corr_with_label")
    .drop("default")
)

It is to be expected that reducing granularity will also reduce power. However, we must remind ourselves that correlation is not causation, and that the trade-off should be worth it. In conclusion, we decide to move forward with the variable:
- num_active_div_by_paid_inv_0_12m_is_above_1


Next, we look at "archived" variables.